Chapter 2 Multiple Regression
library(stargazer)
library(dplyr, warn.conflicts = FALSE)
library(lmwr)
library(xtable)
library(ggplot2)
2.1 Introduction
2.2 Long and Short Regression
2.2.1 Using Short Regression
2.2.2 Independent Explanatory Variables
2.2.3 Dependent Explanatory Variables
2.2.4 Simulation with Multiple Explanatory Variables
<- 1000
N <- 2
a <- 3
b <- 4 c
set.seed(123456789)
<- rnorm(N) u_x
<- function(alpha = 0, return_data = FALSE) {
get_lms <- (1 - alpha) * runif(N) + alpha * u_x
x <- (1 - alpha) * runif(N) + alpha * u_x
w if (alpha == 0) u <<- rnorm(N)
<- a + b* x + c * w + u
y if (return_data) {
data.frame(y, x, w)
else {
} list(lm(y ~ x), lm(y ~ x + w))
} }
set.seed(123456789)
<- rnorm(N)
u_x <- lapply(list(0, 0.5, 0.95), get_lms, return_data = FALSE) lms
library(stargazer)
stargazer(lms, type = "html", keep.stat = c("n", "rsq"), digits = 2)
Dependent variable: | ||||||
y | ||||||
(1) | (2) | (3) | (4) | (5) | (6) | |
x | 3.14*** | 2.81*** | 6.84*** | 2.86*** | 7.01*** | 0.67 |
(0.17) | (0.11) | (0.08) | (0.17) | (0.03) | (1.59) | |
w | 4.05*** | 4.16*** | 6.35*** | |||
(0.11) | (0.16) | (1.59) | ||||
Constant | 3.98*** | 2.15*** | 2.14*** | 2.07*** | 2.07*** | 2.08*** |
(0.10) | (0.08) | (0.05) | (0.04) | (0.03) | (0.03) | |
Observations | 1,000 | 1,000 | 1,000 | 1,000 | 1,000 | 1,000 |
R2 | 0.26 | 0.68 | 0.88 | 0.93 | 0.98 | 0.98 |
Note: | p<0.1; p<0.05; p<0.01 |
2.2.5 Matrix Algebra of Short Regression
set.seed(123456789)
<- rnorm(N)
u_x <- lapply(list(0, 0.5, 0.95), get_lms, return_data = TRUE) dfs
with(dfs[[1]], cov(x, w))
## [1] 0.007019082
with(dfs[[2]], cov(x, w))
## [1] 0.2557656
with(dfs[[2]], t(x) %*% w)
## [,1]
## [1,] 318.8261
2.3 Collinearity and Multicollinearity
2.3.1 Matrix Algebra of Multicollinearity
2.3.2 Understanding Multicollinearity with R
<- with(dfs[[3]], cbind(1, x, w))
X2 solve(t(X2) %*% X2) %*% t(X2) %*% u
## [,1]
## 0.0766164
## x -2.3321265
## w 2.3462160
mean(u)
## [1] 0.07635957
with(dfs[[3]], cov(w, u))
## [1] 0.01413717
1/N * t(X2) %*% u
## [,1]
## 0.07635957
## x 0.01593319
## w 0.01687792
1/det(t(X2) %*% X2)
## [1] 2.610265e-06
2.4 Returns to Schooling
2.4.1 Multiple Regression of Returns to Schooling
2.4.2 NLSM Data
<-
nlsm_mod %>%
nlsm filter(!is.na(lwage76)) %>%
mutate(exp = age76 - ed76 - 6,
exp2 = exp^2/100)
2.4.3 OLS Estimates of Returns to Schooling
<- list()
fm 1]] <- lm(lwage76 ~ ed76, data = nlsm_mod)
fm[[2]] <- update(fm[[1]], ~ . + exp + exp2)
fm[[3]] <- update(fm[[2]], ~ . + black + reg76r)
fm[[4]] <- update(fm[[3]], ~ . + smsa76r + smsa66r + reg662 + reg663 +
fm[[+ reg665 + reg666 + reg667 + reg668 + reg669)
reg664
stargazer(fm, type = "html", keep.stat = c("n", "rsq"),
omit = "^(reg66|smsa)")
Dependent variable: | ||||
lwage76 | ||||
(1) | (2) | (3) | (4) | |
ed76 | 0.052*** | 0.093*** | 0.078*** | 0.075*** |
(0.003) | (0.004) | (0.004) | (0.003) | |
exp | 0.090*** | 0.085*** | 0.085*** | |
(0.007) | (0.007) | (0.007) | ||
exp2 | -0.249*** | -0.234*** | -0.229*** | |
(0.034) | (0.032) | (0.032) | ||
black | -0.178*** | -0.199*** | ||
(0.018) | (0.018) | |||
reg76r | -0.150*** | -0.148*** | ||
(0.015) | (0.026) | |||
Constant | 5.571*** | 4.469*** | 4.796*** | 4.621*** |
(0.039) | (0.069) | (0.069) | (0.074) | |
Observations | 3,010 | 3,010 | 3,010 | 3,010 |
R2 | 0.099 | 0.196 | 0.265 | 0.300 |
Note: | p<0.1; p<0.05; p<0.01 |
2.5 Causal Pathways
2.5.1 Dual-Path Model
2.5.2 Simulation of Dual-Path Model
set.seed(123456789)
<- 50
N <- 1
a <- 0
b <- 3
c <- 4
d
<- function() {
gen_data <- round(runif(N))
x <- runif(N)
u_w <- d * x + u_w
w <- rnorm(N)
u <- a + b* x + c * w + u
y data.frame(y, x, w)
}
<- gen_data()
df
<- list()
fm 1]] <- lm(y ~ x, data = df)
fm[[2]] <- update(fm[[1]], ~ . + w) fm[[
stargazer(fm, type = "html", keep.stat = c("n", "rsq"),
omit = "^(reg66|smsa)")
Dependent variable: | ||
y | ||
(1) | (2) | |
x | 12.241*** | -5.148*** |
(0.413) | (1.885) | |
w | 4.285*** | |
(0.460) | ||
Constant | 2.587*** | 0.564* |
(0.298) | (0.281) | |
Observations | 50 | 50 |
R2 | 0.948 | 0.982 |
Note: | p<0.1; p<0.05; p<0.01 |
<- lm(y ~ x, data = df)$coefficients[2]
e_hat <- lm(y ~ w, data = df)$coefficients[2]
c_hat <- lm(w ~ x, data = df)$coefficients[2]
d_hat <- e_hat - c_hat * d_hat
b_hat b_hat
## x
## -0.08876039
2.5.2.1 Dual Path Estimator Versus Long Regression
<- function(i) {
sim_run
<- gen_data()
df <- summary(lm(y ~ x + w, data = df))[[4]]
lm_temp <- lm(y ~ x, data = df)$coefficients[2]
e_hat <- lm(y ~ w, data = df)$coefficients[2]
c_hat <- lm(w ~ x, data = df)$coefficients[2]
d_hat <- e_hat - c_hat * d_hat
b_hat data.frame(standard = lm_temp[2],
t_stat = lm_temp[8],
proposed = b_hat)
}
<- lapply(1:100, sim_run)
sim_res <- bind_rows(sim_res)
sim_res <- t(as.matrix(bind_rows(lapply(sim_res, summary))))
b_mat colnames(b_mat) <- c("Standard est", "t-stat", "Proposed est")
print(xtable(b_mat, digits = 3),
type = "html")
Standard est | t-stat | Proposed est | |
---|---|---|---|
Min. | -4.805 | -3.029 | -0.120 |
1st Qu. | -1.691 | -0.773 | -0.029 |
Median | -0.273 | -0.144 | -0.006 |
Mean | -0.077 | -0.046 | -0.002 |
3rd Qu. | 1.389 | 0.741 | 0.031 |
Max. | 6.559 | 2.681 | 0.104 |
hist(sim_res$standard, freq = FALSE,
breaks = seq(-8, 8, by = 2),
xlab = "Estimate of b", main = "")
lines(density(sim_res$standard), type = "l", lwd = 3)
abline(v = c(min(sim_res$proposed),
max(sim_res$proposed)), lty = 2, lwd = 3)
%>%
sim_res ggplot(aes(x = standard, y = ..density..)) +
geom_histogram(breaks = seq(-8, 8, by = 2),
fill = "grey", colour = "black") +
geom_density() +
geom_vline(xintercept = c(min(sim_res$proposed),
max(sim_res$proposed)),
linetype = 2)
2.6 Are Bankers Racist or Greedy?
2.6.1 Boston HDMA Data
<-
hdma_cc %>%
hdma mutate(lwage = if_else(wage > 0, log(wage), NA_real_),
emp = if_else(emp > 1000, NA_real_, emp)) %>%
select(deny, black, lwage, chist, mhist, phist, emp)
<- lm(deny ~ black, data = hdma_cc)
lm1 print(xtable(lm1), type = "html")
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 0.1131 | 0.0069 | 16.30 | 0.0000 |
blackTRUE | 0.1945 | 0.0174 | 11.21 | 0.0000 |
2.6.2 Causal Pathways of Discrimination
2.6.3 Estimating the Direct Effect
<- function(df, y_var, x_var) {
get_direct_effect
<- na.omit(df)
df
<- df %>% select(-all_of(c(x_var, y_var)))
W <- df %>% select(all_of(y_var))
Y <- df %>% select(all_of(x_var))
X
<- cbind(1, as.matrix(X))
X <- cbind(1, as.matrix(W))
W <- as.matrix(Y)
Y
<- solve(t(X) %*% X) %*% t(X) %*% Y
beta_tilde_hat <- solve(t(X) %*% X) %*% t(X) %*% W
Delta_hat <- solve(t(W) %*% W) %*% t(W) %*% Y
gamma_hat <- beta_tilde_hat - Delta_hat %*% gamma_hat
beta_hat tibble(intercept = beta_hat[1], `direct effect` = beta_hat[2])
}
%>%
hdma_cc get_direct_effect(y_var = "deny", x_var = "black") %>%
xtable(digits = 8) %>%
print(include.rownames = FALSE, type = "html")
intercept | direct effect |
---|---|
-0.01797654 | 0.11112066 |
2.6.4 Adding in More Variables
<-
hdma_more %>%
hdma mutate(lwage = if_else(wage > 0, log(wage), NA_real_),
lwage_coap = if_else(wage_coap > 0, log(wage_coap), NA_real_),
lwage_coap2 = if_else(coap, lwage_coap, 0)) %>%
select(deny, black, lwage, chist, mhist, phist, emp,
married, dr, clines, male, suff, assets, lr, pr,
coap, s20, s24a, s27a, s39, s48, s53, s55, s56,
s57, chval, school, bd, mi, old, vr, uria, netw,%>%
lwage_coap2) na.omit()
2.6.5 Bootstrap Dual-Path Estimator in R
set.seed(123456789)
<- 1000
K
<- function(i) {
run_bs_iter %>%
hdma_more slice_sample(prop = 1, replace = TRUE) %>%
get_direct_effect(y_var = "deny", x_var = "black")
}
<- bind_rows(lapply(1:K, run_bs_iter)) sim_res
<- function(df) {
my_summ <- function(x) {
my_summ_fun tibble(Estimate = mean(x),
SD = sd(x),
`2.5%` = quantile(x, 0.025),
`97.5%` = quantile(x, 0.975))
}
bind_rows(lapply(df, my_summ_fun), .id = "variable")
}
%>%
sim_res my_summ() %>%
xtable() %>%
print(include.rownames = FALSE, type = "html")
variable | Estimate | SD | 2.5% | 97.5% |
---|---|---|---|---|
intercept | -0.00 | 0.00 | -0.01 | 0.00 |
direct effect | 0.02 | 0.02 | -0.01 | 0.05 |