Chapter 3 Instrumental Variables
library(dplyr, warn.conflicts = FALSE)
library(stargazer)
library(lmwr)
library(tidyr)
3.1 Introduction
3.2 A Confounded Model
3.2.1 Confounded Model DAG
3.2.2 Confounded Linear Model
3.2.3 Simulation of Confounded Data
Here I create a function that returns simulated data as a data frame.
set.seed(123456789)
<- function() {
get_data <- 1000
N <- 2
a <- 3
b <- 2
c <- 3
e <- -1
f <- 4
d <- runif(N)
z <- rnorm(N, mean = 0, sd = 3)
u_1 <- rnorm(N, mean = 0, sd = 1)
u_2
<- f + d * z + u_2 + c * u_1
x <- a + b * x + e * u_1
y data.frame(x, y, z)
}
<- get_data()
df
<- lm(y ~ x, data = df)
lm1
lm1
##
## Call:
## lm(formula = y ~ x, data = df)
##
## Coefficients:
## (Intercept) x
## 0.5114 4.4090
3.3 IV Estimator
3.3.1 Graph Algebra of IV Estimator
3.3.2 Properties of IV Estimator
3.3.3 IV Estimator with Standard Algebra
3.3.4 Simulation of an IV Estimator
3.3.5 IV Esimator with Matrix Algebra
3.3.6 Two-Stage Least Squares
3.3.7 IV Estimator in R
<- cbind(1, df$x)
X <- cbind(1, df$z)
Z <- df$y
y
<- solve(t(X) %*% X) %*% t(X) %*% y
beta_hat_ols <- solve(t(Z) %*% X) %*% t(Z) %*% y
beta_hat_iv
beta_hat_ols
## [,1]
## [1,] 0.5113529
## [2,] 4.4090257
beta_hat_iv
## [,1]
## [1,] 1.715181
## [2,] 3.015432
<- function(y, X_in, Z_in = X_in, reps = 100,
lm_iv min_in = 0.05, max_in = 0.95) {
set.seed(123456789)
<- cbind(1, X_in)
X <- cbind(1, Z_in)
Z
<- matrix(NA, reps, dim(X)[2])
bs_mat
<- length(y)
N for (r in 1:reps) {
# sample(1:N, size = N, replace = TRUE)
<- round(runif(N, min = 1, max = N))
index_bs <- y[index_bs]
y_bs <- X[index_bs, ]
X_bs <- Z[index_bs, ]
Z_bs <- solve(t(Z_bs) %*% X_bs) %*% t(Z_bs) %*% y_bs
bs_mat[r, ]
}
# Present results
<- matrix(NA, dim(X)[2], 4)
tab_res 1] <- colMeans(bs_mat)
tab_res[, for (j in 1:dim(X)[2]) {
2] <- sd(bs_mat[ , j])
tab_res[j, 3] <- quantile(bs_mat[ , j], min_in)
tab_res[j, 4] <- quantile(bs_mat[ , j], max_in)
tab_res[j,
}colnames(tab_res) <- c("coef", "sd", as.character(min_in),
as.character(max_in))
return(tab_res)
}
print(lm_iv(df$y, df$x), digits = 3)
## coef sd 0.05 0.95
## [1,] 0.513 0.0729 0.396 0.643
## [2,] 4.409 0.0109 4.394 4.427
print(lm_iv(df$y, df$x, df$z), digits = 3)
## coef sd 0.05 0.95
## [1,] 1.83 0.377 1.30 2.43
## [2,] 2.96 0.286 2.45 3.37
3.4 Returns to schooling
<-
nlsm %>%
nlsm mutate(exp = age76 - ed76 - 6,
exp2 = exp^2/100) %>%
filter(!is.na(lwage76))
<- list()
fm 1]] <- lm(lwage76 ~ ed76, data = nlsm)
fm[[2]] <- update(fm[[1]], ~ . + exp + exp2)
fm[[3]] <- update(fm[[2]], ~ . + black + reg76r)
fm[[4]] <- update(fm[[3]], ~ . + smsa76r + smsa66r + reg662 + reg663 +
fm[[+ reg665 + reg666 + reg667 + reg668 + reg669)
reg664 5]] <- update(fm[[4]], ~ . - ed76 + nearc4)
fm[[6]] <- update(fm[[5]], ed76 ~ .) fm[[
stargazer(fm, type = "html", keep.stat = c("n", "rsq"),
omit = "^(reg66|smsa)")
Dependent variable: | ||||||
lwage76 | ed76 | |||||
(1) | (2) | (3) | (4) | (5) | (6) | |
ed76 | 0.052*** | 0.093*** | 0.078*** | 0.075*** | ||
(0.003) | (0.004) | (0.004) | (0.003) | |||
exp | 0.090*** | 0.085*** | 0.085*** | 0.054*** | -0.413*** | |
(0.007) | (0.007) | (0.007) | (0.007) | (0.034) | ||
exp2 | -0.249*** | -0.234*** | -0.229*** | -0.222*** | 0.087 | |
(0.034) | (0.032) | (0.032) | (0.034) | (0.165) | ||
black | -0.178*** | -0.199*** | -0.270*** | -0.936*** | ||
(0.018) | (0.018) | (0.019) | (0.094) | |||
reg76r | -0.150*** | -0.148*** | -0.151*** | -0.052 | ||
(0.015) | (0.026) | (0.028) | (0.135) | |||
nearc4 | 0.042** | 0.320*** | ||||
(0.018) | (0.088) | |||||
Constant | 5.571*** | 4.469*** | 4.796*** | 4.621*** | 5.854*** | 16.638*** |
(0.039) | (0.069) | (0.069) | (0.074) | (0.050) | (0.241) | |
Observations | 3,010 | 3,010 | 3,010 | 3,010 | 3,010 | 3,010 |
R2 | 0.099 | 0.196 | 0.265 | 0.300 | 0.195 | 0.477 |
Note: | p<0.1; p<0.05; p<0.01 |
5]]$coefficients[2]/fm[[6]]$coefficients[2] fm[[
## exp
## -0.1309503
5]]$coefficients[["nearc4"]]/fm[[6]]$coefficients[["nearc4"]] fm[[
## [1] 0.1315038
<- function(y_in, X_in, Z_in = X_in) {
iv_fit # Convert supplied data to matrices
<- as.matrix(y_in, ncol=1)
y <- cbind(1, as.matrix(X_in))
X <- cbind(1, as.matrix(Z_in))
Z
<- solve(t(Z) %*% X) %*% t(Z) %*% y
res rownames(res) <- c("intercept", colnames(X_in))
return(res)
}
<- function(i, y, X_in, Z_in) {
iv_bs
<- as.matrix(y, ncol=1)
y_vec <- length(y_vec)
N
<- round(runif(N, min = 1, max = N))
index_bs <- y_vec[index_bs, ]
y_bs <- X_in[index_bs, ]
X_bs <- Z_in[index_bs, ]
Z_bs
as_tibble(t(iv_fit(y_bs, X_bs, Z_bs)))
}
<- function(y, X_in, Z_in = X_in, reps = 100) {
lm_iv
set.seed(123456789)
bind_rows(lapply(1:reps, iv_bs, y, X_in, Z_in))
}
<- function(x, p_lower, p_upper) {
get_summ_stats <- tibble(mean(x),
df_temp sd(x),
quantile(x, p_lower),
quantile(x, p_upper))
names(df_temp) <- c("coef", "sd",
paste0("p", as.character(p_lower * 100)),
paste0("p", as.character(p_upper * 100)))
return(df_temp)
}
<- function(bs_mat, p_lower = 0.05, p_upper = 0.95) {
get_tab_res bind_rows(lapply(bs_mat, get_summ_stats, p_lower, p_upper),
.id = "var")
}
<-
y %>%
nlsm select(lwage76)
<-
X %>%
nlsm select(ed76, exp, exp2, black, reg76r,
:reg669)
smsa76r, smsa66r, reg662<-
Z1 %>%
nlsm mutate(age2 = age76^2) %>%
select(nearc4, age76, age2, black, reg76r,
:reg669)
smsa76r, smsa66r, reg662
<-
Z2 %>%
nlsm mutate(age2 = age76^2) %>%
select(momdad14, age76, age2, black, reg76r,
:reg669)
smsa76r, smsa66r, reg662
<- lm_iv(y, X, Z1, reps = 1000)
res_Z1 get_tab_res(res_Z1)
## # A tibble: 16 x 5
## var coef sd p5 p95
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.98 0.824 2.78 4.95
## 2 ed76 0.132 0.0730 0.0483 0.240
## 3 exp 0.0602 0.0360 0.00975 0.102
## 4 exp2 -0.0996 0.186 -0.319 0.165
## 5 black -0.120 0.102 -0.244 0.0333
## 6 reg76r -0.145 0.0311 -0.193 -0.0911
## 7 smsa76r 0.0807 0.0792 -0.0351 0.170
## 8 smsa66r 0.0318 0.0244 -0.00428 0.0685
## 9 reg662 0.0770 0.0524 0.000413 0.148
## 10 reg663 0.128 0.0477 0.0545 0.196
## 11 reg664 0.0272 0.0651 -0.0645 0.116
## 12 reg665 0.140 0.0502 0.0608 0.220
## 13 reg666 0.151 0.0516 0.0675 0.230
## 14 reg667 0.121 0.0542 0.0362 0.205
## 15 reg668 -0.102 0.0853 -0.231 0.00675
## 16 reg669 0.0908 0.0624 -0.000498 0.173
<- lm_iv(y, X, Z2, reps = 1000)
res_Z2 <- (res_Z1 - res_Z2)[, "ed76"]
bs_diff summary(bs_diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.25500 -0.02737 0.00887 0.01738 0.05257 0.91591
# 3.4.5 Concerns with Distance to College
<- function(.data, .by, ..., .groups = NULL) {
summarise_by %>%
.data summarise(across(c(ed76, exp, black, south66,
smsa66r, reg76r, smsa76r), mean),.groups = "keep") %>%
pivot_longer(cols = everything())
pivot_wider(names_from = .by, values_from = value)
}
%>%
nlsm mutate(nearc4 = case_when(nearc4 == 0 ~ "Not near college",
== 1 ~ "Near college")) %>%
nearc4 group_by(nearc4) %>%
summarise(across(c(ed76, exp, black, south66,
smsa66r, reg76r, smsa76r), mean))
## # A tibble: 2 x 8
## nearc4 ed76 exp black south66 smsa66r reg76r smsa76r
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Near college 13.5 8.68 0.212 0.329 0.799 0.329 0.822
## 2 Not near college 12.7 9.23 0.280 0.598 0.328 0.563 0.479