Chapter 2 Multiple Regression

library(stargazer)
library(dplyr, warn.conflicts = FALSE)
library(lmwr)
library(xtable)
library(ggplot2)

2.1 Introduction

2.2 Long and Short Regression

2.2.1 Using Short Regression

2.2.2 Independent Explanatory Variables

2.2.3 Dependent Explanatory Variables

2.2.4 Simulation with Multiple Explanatory Variables

N <- 1000
a <- 2
b <- 3
c <- 4
set.seed(123456789)
u_x <- rnorm(N)
get_lms <- function(alpha = 0, return_data = FALSE) {
  x <- (1 - alpha) * runif(N) + alpha * u_x
  w <- (1 - alpha) * runif(N) + alpha * u_x
  if (alpha == 0) u <<- rnorm(N)
  y <- a + b* x + c * w + u
  if (return_data) {
    data.frame(y, x, w)
  } else {
    list(lm(y ~ x), lm(y ~ x + w))
  }
}
set.seed(123456789)
u_x <- rnorm(N)
lms <- lapply(list(0, 0.5, 0.95), get_lms, return_data = FALSE)
library(stargazer)
stargazer(lms, type = "html", keep.stat = c("n", "rsq"), digits = 2)
Dependent variable:
y
(1) (2) (3) (4) (5) (6)
x 3.14*** 2.81*** 6.84*** 2.86*** 7.01*** 0.67
(0.17) (0.11) (0.08) (0.17) (0.03) (1.59)
w 4.05*** 4.16*** 6.35***
(0.11) (0.16) (1.59)
Constant 3.98*** 2.15*** 2.14*** 2.07*** 2.07*** 2.08***
(0.10) (0.08) (0.05) (0.04) (0.03) (0.03)
Observations 1,000 1,000 1,000 1,000 1,000 1,000
R2 0.26 0.68 0.88 0.93 0.98 0.98
Note: p<0.1; p<0.05; p<0.01

2.2.5 Matrix Algebra of Short Regression

set.seed(123456789)
u_x <- rnorm(N)
dfs <- lapply(list(0, 0.5, 0.95), get_lms, return_data = TRUE)
with(dfs[[1]], cov(x, w))
## [1] 0.007019082
with(dfs[[2]], cov(x, w))
## [1] 0.2557656
with(dfs[[2]], t(x) %*% w)
##          [,1]
## [1,] 318.8261

2.3 Collinearity and Multicollinearity

2.3.1 Matrix Algebra of Multicollinearity

2.3.2 Understanding Multicollinearity with R

X2 <- with(dfs[[3]], cbind(1, x, w))
solve(t(X2) %*% X2) %*% t(X2) %*% u
##         [,1]
##    0.0766164
## x -2.3321265
## w  2.3462160
mean(u)
## [1] 0.07635957
with(dfs[[3]], cov(w, u))
## [1] 0.01413717
1/N * t(X2) %*% u
##         [,1]
##   0.07635957
## x 0.01593319
## w 0.01687792
1/det(t(X2) %*% X2)
## [1] 2.610265e-06

2.4 Returns to Schooling

2.4.1 Multiple Regression of Returns to Schooling

2.4.2 NLSM Data

nlsm_mod <-
  nlsm %>%
  filter(!is.na(lwage76)) %>%
  mutate(exp = age76 - ed76 - 6,
         exp2 = exp^2/100)

2.4.3 OLS Estimates of Returns to Schooling

fm <- list()
fm[[1]] <- lm(lwage76 ~ ed76, data = nlsm_mod)
fm[[2]] <- update(fm[[1]], ~ . + exp + exp2)
fm[[3]] <- update(fm[[2]], ~ . + black + reg76r)
fm[[4]] <- update(fm[[3]], ~ . + smsa76r + smsa66r + reg662 + reg663 +
                    reg664 + reg665 + reg666 + reg667 + reg668 + reg669)

stargazer(fm, type = "html", keep.stat = c("n", "rsq"),
          omit = "^(reg66|smsa)")
Dependent variable:
lwage76
(1) (2) (3) (4)
ed76 0.052*** 0.093*** 0.078*** 0.075***
(0.003) (0.004) (0.004) (0.003)
exp 0.090*** 0.085*** 0.085***
(0.007) (0.007) (0.007)
exp2 -0.249*** -0.234*** -0.229***
(0.034) (0.032) (0.032)
black -0.178*** -0.199***
(0.018) (0.018)
reg76r -0.150*** -0.148***
(0.015) (0.026)
Constant 5.571*** 4.469*** 4.796*** 4.621***
(0.039) (0.069) (0.069) (0.074)
Observations 3,010 3,010 3,010 3,010
R2 0.099 0.196 0.265 0.300
Note: p<0.1; p<0.05; p<0.01

2.5 Causal Pathways

2.5.1 Dual-Path Model

2.5.2 Simulation of Dual-Path Model

set.seed(123456789)
N <- 50
a <- 1
b <- 0
c <- 3
d <- 4

gen_data <- function() {
  x <- round(runif(N))
  u_w <- runif(N)
  w <- d * x + u_w
  u <- rnorm(N)
  y <- a + b* x + c * w + u
  data.frame(y, x, w)
}

df <- gen_data()
  
fm <- list()
fm[[1]] <- lm(y ~ x, data = df)
fm[[2]] <- update(fm[[1]], ~ . + w)
stargazer(fm, type = "html", keep.stat = c("n", "rsq"),
          omit = "^(reg66|smsa)")
Dependent variable:
y
(1) (2)
x 12.241*** -5.148***
(0.413) (1.885)
w 4.285***
(0.460)
Constant 2.587*** 0.564*
(0.298) (0.281)
Observations 50 50
R2 0.948 0.982
Note: p<0.1; p<0.05; p<0.01
e_hat <- lm(y ~ x, data = df)$coefficients[2]
c_hat <- lm(y ~ w, data = df)$coefficients[2]
d_hat <- lm(w ~ x, data = df)$coefficients[2]
b_hat <- e_hat - c_hat * d_hat
b_hat
##           x 
## -0.08876039

2.5.2.1 Dual Path Estimator Versus Long Regression

sim_run <- function(i) {

  df <- gen_data()
  lm_temp <- summary(lm(y ~ x + w, data = df))[[4]]
  e_hat <- lm(y ~ x, data = df)$coefficients[2]
  c_hat <- lm(y ~ w, data = df)$coefficients[2]
  d_hat <- lm(w ~ x, data = df)$coefficients[2]
  b_hat <- e_hat - c_hat * d_hat
  data.frame(standard = lm_temp[2], 
             t_stat = lm_temp[8], 
             proposed = b_hat)
}
sim_res <- lapply(1:100, sim_run)
sim_res <- bind_rows(sim_res)
b_mat <- t(as.matrix(bind_rows(lapply(sim_res, summary))))
colnames(b_mat) <- c("Standard est", "t-stat", "Proposed est")
print(xtable(b_mat, digits = 3),
      type = "html")
Standard est t-stat Proposed est
Min. -4.805 -3.029 -0.120
1st Qu. -1.691 -0.773 -0.029
Median -0.273 -0.144 -0.006
Mean -0.077 -0.046 -0.002
3rd Qu. 1.389 0.741 0.031
Max. 6.559 2.681 0.104
hist(sim_res$standard, freq = FALSE,
     breaks = seq(-8, 8, by = 2),
     xlab = "Estimate of b", main = "")
lines(density(sim_res$standard), type = "l", lwd = 3)
abline(v = c(min(sim_res$proposed),
             max(sim_res$proposed)), lty = 2, lwd = 3)

sim_res %>%
  ggplot(aes(x = standard,  y = ..density..)) +
  geom_histogram(breaks = seq(-8, 8, by = 2),
                 fill = "grey", colour = "black") +
  geom_density() +
  geom_vline(xintercept = c(min(sim_res$proposed),
                            max(sim_res$proposed)),
             linetype = 2)

2.6 Are Bankers Racist or Greedy?

2.6.1 Boston HDMA Data

hdma_cc <- 
  hdma %>%
  mutate(lwage = if_else(wage > 0, log(wage), NA_real_),
         emp = if_else(emp > 1000, NA_real_, emp)) %>%
  select(deny, black, lwage, chist, mhist, phist, emp) 

lm1 <- lm(deny ~  black, data = hdma_cc) 
print(xtable(lm1), type = "html")
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.1131 0.0069 16.30 0.0000
blackTRUE 0.1945 0.0174 11.21 0.0000

2.6.2 Causal Pathways of Discrimination

2.6.3 Estimating the Direct Effect

get_direct_effect <- function(df, y_var, x_var) {
  
  df <- na.omit(df)
  
  W <- df %>% select(-all_of(c(x_var, y_var)))
  Y <- df %>% select(all_of(y_var))
  X <- df %>% select(all_of(x_var))
  
  X <- cbind(1, as.matrix(X))
  W <- cbind(1, as.matrix(W))
  Y <- as.matrix(Y)
  
  beta_tilde_hat <- solve(t(X) %*% X) %*% t(X) %*% Y
  Delta_hat <- solve(t(X) %*% X) %*% t(X) %*% W
  gamma_hat <- solve(t(W) %*% W) %*% t(W) %*% Y
  beta_hat <- beta_tilde_hat - Delta_hat %*% gamma_hat
  tibble(intercept = beta_hat[1], `direct effect` = beta_hat[2])
}
hdma_cc %>%
  get_direct_effect(y_var = "deny", x_var = "black") %>%
  xtable(digits = 8) %>%
  print(include.rownames = FALSE, type = "html")
intercept direct effect
-0.01797654 0.11112066

2.6.4 Adding in More Variables

hdma_more <- 
  hdma %>%
  mutate(lwage = if_else(wage > 0, log(wage), NA_real_),
         lwage_coap = if_else(wage_coap > 0, log(wage_coap), NA_real_),
         lwage_coap2 = if_else(coap, lwage_coap, 0)) %>%
  select(deny, black, lwage, chist, mhist, phist, emp,
         married, dr, clines, male, suff, assets, lr, pr,
         coap, s20, s24a, s27a, s39, s48, s53, s55, s56,
         s57, chval, school, bd, mi, old, vr, uria, netw,
         lwage_coap2) %>%
  na.omit()

2.6.5 Bootstrap Dual-Path Estimator in R

set.seed(123456789)
K <- 1000

run_bs_iter <- function(i) {
  hdma_more %>% 
    slice_sample(prop = 1, replace = TRUE) %>%
    get_direct_effect(y_var = "deny", x_var = "black")
}

sim_res <- bind_rows(lapply(1:K, run_bs_iter))
my_summ <- function(df) {
  my_summ_fun <- function(x) {
    tibble(Estimate = mean(x), 
           SD = sd(x),
           `2.5%` = quantile(x, 0.025),
           `97.5%` = quantile(x, 0.975))
  }

  bind_rows(lapply(df, my_summ_fun), .id = "variable")
}
sim_res %>% 
  my_summ() %>% 
  xtable() %>%
  print(include.rownames = FALSE, type = "html")
variable Estimate SD 2.5% 97.5%
intercept -0.00 0.00 -0.01 0.00
direct effect 0.02 0.02 -0.01 0.05