Chapter 3 Instrumental Variables

library(dplyr, warn.conflicts = FALSE)
library(stargazer)
library(lmwr)
library(tidyr)

3.1 Introduction

3.2 A Confounded Model

3.2.1 Confounded Model DAG

3.2.2 Confounded Linear Model

3.2.3 Simulation of Confounded Data

Here I create a function that returns simulated data as a data frame.

set.seed(123456789)

get_data <- function() {
  N <- 1000
  a <- 2
  b <- 3
  c <- 2
  e <- 3
  f <- -1
  d <- 4
  z <- runif(N)
  u_1 <- rnorm(N, mean = 0, sd = 3)
  u_2 <- rnorm(N, mean = 0, sd = 1)
  
  x <- f + d * z + u_2 + c * u_1
  y <- a + b * x + e * u_1
  data.frame(x, y, z)
}

df <- get_data()

lm1 <- lm(y ~ x, data = df)

lm1
## 
## Call:
## lm(formula = y ~ x, data = df)
## 
## Coefficients:
## (Intercept)            x  
##      0.5114       4.4090

3.3 IV Estimator

3.3.1 Graph Algebra of IV Estimator

3.3.2 Properties of IV Estimator

3.3.3 IV Estimator with Standard Algebra

3.3.4 Simulation of an IV Estimator

3.3.5 IV Esimator with Matrix Algebra

3.3.6 Two-Stage Least Squares

3.3.7 IV Estimator in R

X <- cbind(1, df$x)
Z <- cbind(1, df$z)
y <- df$y

beta_hat_ols <- solve(t(X) %*% X) %*% t(X) %*% y
beta_hat_iv <- solve(t(Z) %*% X) %*% t(Z) %*% y

beta_hat_ols
##           [,1]
## [1,] 0.5113529
## [2,] 4.4090257
beta_hat_iv
##          [,1]
## [1,] 1.715181
## [2,] 3.015432
lm_iv <- function(y, X_in, Z_in = X_in, reps = 100,
                  min_in = 0.05, max_in = 0.95) {
  
  set.seed(123456789)
  X <- cbind(1, X_in)
  Z <- cbind(1, Z_in)
  
  bs_mat <- matrix(NA, reps, dim(X)[2])
  
  N <- length(y)
  for (r in 1:reps) {
    # sample(1:N, size = N, replace = TRUE)
    index_bs <- round(runif(N, min = 1, max = N))
    y_bs <- y[index_bs]
    X_bs <- X[index_bs, ]
    Z_bs <- Z[index_bs, ]
    bs_mat[r, ] <- solve(t(Z_bs) %*% X_bs) %*% t(Z_bs) %*% y_bs
  }

  # Present results
  tab_res <- matrix(NA, dim(X)[2], 4)
  tab_res[, 1] <- colMeans(bs_mat)
  for (j in 1:dim(X)[2]) {
    tab_res[j, 2] <- sd(bs_mat[ , j])
    tab_res[j, 3] <- quantile(bs_mat[ , j], min_in)
    tab_res[j, 4] <- quantile(bs_mat[ , j], max_in)
  }
  colnames(tab_res) <- c("coef", "sd", as.character(min_in),
                         as.character(max_in))
  return(tab_res)
  
}

print(lm_iv(df$y, df$x), digits = 3)
##       coef     sd  0.05  0.95
## [1,] 0.513 0.0729 0.396 0.643
## [2,] 4.409 0.0109 4.394 4.427
print(lm_iv(df$y, df$x, df$z), digits = 3)
##      coef    sd 0.05 0.95
## [1,] 1.83 0.377 1.30 2.43
## [2,] 2.96 0.286 2.45 3.37

3.4 Returns to schooling

nlsm <-
  nlsm %>%
  mutate(exp = age76 - ed76 - 6,
         exp2 = exp^2/100) %>%
  filter(!is.na(lwage76))

fm <- list()
fm[[1]] <- lm(lwage76 ~ ed76, data = nlsm)
fm[[2]] <- update(fm[[1]], ~ . + exp + exp2)
fm[[3]] <- update(fm[[2]], ~ . + black + reg76r)
fm[[4]] <- update(fm[[3]], ~ . + smsa76r + smsa66r + reg662 + reg663 +
                    reg664 + reg665 + reg666 + reg667 + reg668 + reg669)
fm[[5]] <- update(fm[[4]], ~ . - ed76 + nearc4)
fm[[6]] <- update(fm[[5]], ed76 ~ .)
stargazer(fm, type = "html", keep.stat = c("n", "rsq"),
          omit = "^(reg66|smsa)")
Dependent variable:
lwage76 ed76
(1) (2) (3) (4) (5) (6)
ed76 0.052*** 0.093*** 0.078*** 0.075***
(0.003) (0.004) (0.004) (0.003)
exp 0.090*** 0.085*** 0.085*** 0.054*** -0.413***
(0.007) (0.007) (0.007) (0.007) (0.034)
exp2 -0.249*** -0.234*** -0.229*** -0.222*** 0.087
(0.034) (0.032) (0.032) (0.034) (0.165)
black -0.178*** -0.199*** -0.270*** -0.936***
(0.018) (0.018) (0.019) (0.094)
reg76r -0.150*** -0.148*** -0.151*** -0.052
(0.015) (0.026) (0.028) (0.135)
nearc4 0.042** 0.320***
(0.018) (0.088)
Constant 5.571*** 4.469*** 4.796*** 4.621*** 5.854*** 16.638***
(0.039) (0.069) (0.069) (0.074) (0.050) (0.241)
Observations 3,010 3,010 3,010 3,010 3,010 3,010
R2 0.099 0.196 0.265 0.300 0.195 0.477
Note: p<0.1; p<0.05; p<0.01
fm[[5]]$coefficients[2]/fm[[6]]$coefficients[2]
##        exp 
## -0.1309503
fm[[5]]$coefficients[["nearc4"]]/fm[[6]]$coefficients[["nearc4"]]
## [1] 0.1315038
iv_fit <- function(y_in, X_in, Z_in = X_in) {
  # Convert supplied data to matrices
  y <- as.matrix(y_in, ncol=1)
  X <- cbind(1, as.matrix(X_in))
  Z <- cbind(1, as.matrix(Z_in))

  res <- solve(t(Z) %*% X) %*% t(Z) %*% y
  rownames(res) <- c("intercept", colnames(X_in))
  return(res)
}

iv_bs <- function(i, y, X_in, Z_in) {

  y_vec <- as.matrix(y, ncol=1)
  N <- length(y_vec)

  index_bs <- round(runif(N, min = 1, max = N))
  y_bs <- y_vec[index_bs, ]
  X_bs <- X_in[index_bs, ]
  Z_bs <- Z_in[index_bs, ]

  as_tibble(t(iv_fit(y_bs, X_bs, Z_bs)))
}

lm_iv <- function(y, X_in, Z_in = X_in, reps = 100) {

  set.seed(123456789)
  bind_rows(lapply(1:reps, iv_bs, y, X_in, Z_in))
}

get_summ_stats <- function(x, p_lower, p_upper) {
  df_temp <- tibble(mean(x),
                    sd(x),
                    quantile(x, p_lower),
                    quantile(x, p_upper))

  names(df_temp) <- c("coef", "sd",
                         paste0("p", as.character(p_lower * 100)),
                         paste0("p", as.character(p_upper * 100)))
  return(df_temp)
}

get_tab_res <- function(bs_mat, p_lower = 0.05, p_upper = 0.95) {
  bind_rows(lapply(bs_mat, get_summ_stats, p_lower, p_upper),
            .id = "var")
}

y <-
  nlsm %>%
  select(lwage76)

X <-
  nlsm %>%
  select(ed76, exp, exp2, black, reg76r,
                     smsa76r, smsa66r, reg662:reg669)
Z1 <-
  nlsm %>%
  mutate(age2 = age76^2) %>%
  select(nearc4, age76, age2, black, reg76r,
         smsa76r, smsa66r, reg662:reg669)

Z2 <-
  nlsm %>%
  mutate(age2 = age76^2) %>%
  select(momdad14, age76, age2, black, reg76r,
         smsa76r, smsa66r, reg662:reg669)

res_Z1 <- lm_iv(y, X, Z1, reps = 1000)
get_tab_res(res_Z1)
## # A tibble: 16 x 5
##    var          coef     sd        p5      p95
##    <chr>       <dbl>  <dbl>     <dbl>    <dbl>
##  1 intercept  3.98   0.824   2.78      4.95   
##  2 ed76       0.132  0.0730  0.0483    0.240  
##  3 exp        0.0602 0.0360  0.00975   0.102  
##  4 exp2      -0.0996 0.186  -0.319     0.165  
##  5 black     -0.120  0.102  -0.244     0.0333 
##  6 reg76r    -0.145  0.0311 -0.193    -0.0911 
##  7 smsa76r    0.0807 0.0792 -0.0351    0.170  
##  8 smsa66r    0.0318 0.0244 -0.00428   0.0685 
##  9 reg662     0.0770 0.0524  0.000413  0.148  
## 10 reg663     0.128  0.0477  0.0545    0.196  
## 11 reg664     0.0272 0.0651 -0.0645    0.116  
## 12 reg665     0.140  0.0502  0.0608    0.220  
## 13 reg666     0.151  0.0516  0.0675    0.230  
## 14 reg667     0.121  0.0542  0.0362    0.205  
## 15 reg668    -0.102  0.0853 -0.231     0.00675
## 16 reg669     0.0908 0.0624 -0.000498  0.173
res_Z2 <- lm_iv(y, X, Z2, reps = 1000)
bs_diff <- (res_Z1 - res_Z2)[, "ed76"]
summary(bs_diff)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.25500 -0.02737  0.00887  0.01738  0.05257  0.91591
# 3.4.5 Concerns with Distance to College
summarise_by <- function(.data, .by, ..., .groups = NULL) {
  .data %>%
    summarise(across(c(ed76, exp, black, south66,
                       smsa66r, reg76r, smsa76r), mean),
              .groups = "keep") %>%
    pivot_longer(cols = everything())
    pivot_wider(names_from = .by, values_from = value)
}

nlsm %>%
  mutate(nearc4 = case_when(nearc4 == 0 ~ "Not near college",
                            nearc4 == 1 ~ "Near college")) %>%
  group_by(nearc4) %>%
  summarise(across(c(ed76, exp, black, south66,
                     smsa66r, reg76r, smsa76r), mean))
## # A tibble: 2 x 8
##   nearc4            ed76   exp black south66 smsa66r reg76r smsa76r
##   <chr>            <dbl> <dbl> <dbl>   <dbl>   <dbl>  <dbl>   <dbl>
## 1 Near college      13.5  8.68 0.212   0.329   0.799  0.329   0.822
## 2 Not near college  12.7  9.23 0.280   0.598   0.328  0.563   0.479