Chapter 1 Ordinary Least Squares

library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(xtable)
library(lmwr)

1.1 Introduction

1.2 Estimating the Causal Effect

1.2.1 Graphing the Causal Effect

1.2.2 A Linear Causal Model

1.2.3 Simulation of the Causal Effect

set.seed(123456789)
N <- 100
a <- 2
b <- 3
c <- 3
d <- 4

gen_data <- function() {
  x <- runif(N)
  u <- rnorm(N)
  y <- a + b* x + u
  data.frame(y, x, u)
}

df <- gen_data()

1.2.4 Averaging to Estimate the Causal Effect

df %>% filter(x > 0.95) %>% summarize(mean(y)) %>% pull() -
  (df %>% filter(x < 0.05) %>% summarize(mean(y)) %>% pull())
## [1] 2.721387
df %>%
  ggplot() + 
  geom_point(aes(x, y)) +
  geom_abline(intercept = 2, slope = 3)

### Assumptions of the OLS Model

1.3 Matrix Algebra of the OLS Model

1.3.1 Standard Algebra of the OLS Model

1.3.2 Algebraic OLS Estimator in R

1.3.3 Using Matrices

1.3.4 Multiplying Matrices in R

Oops!! Seems we have switched to = for assignment instead of <- in the book here.

x1 <- df$x[1:5]
X1 <- cbind(1, x1)
as.vector(X1 %*% c(2, 3))
## [1] 4.079527 4.018643 3.961705 4.156967 4.764634

Compare to the true values:

df$y[1:5]
## [1] 4.224902 4.219999 5.374130 3.378327 5.541021

1.3.5 Matrix Estimator of OLS

1.3.6 Matrix Estimator of OLS in R

X <- cbind(1, df$x)
A <- matrix(c(1:6), nrow=3)
A
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
t(A)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
t(A) %*% A
##      [,1] [,2]
## [1,]   14   32
## [2,]   32   77
t(X) %*% X
##           [,1]     [,2]
## [1,] 100.00000 49.56876
## [2,]  49.56876 32.99536
solve(t(X) %*% X)
##             [,1]        [,2]
## [1,]  0.03916481 -0.05883708
## [2,] -0.05883708  0.11869792
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% df$y
beta_hat
##          [,1]
## [1,] 2.181997
## [2,] 3.092764
solve(t(X) %*% X) %*% t(X) %*% df$u
##           [,1]
## [1,] 0.1819969
## [2,] 0.0927640

1.4 Least Squares Method for OLS

1.4.1 Moment Estimation

1.4.2 Algebra of Least Squares

1.4.3 Estimating Least Squares in R

optimize(function(b) sum((df$y - 2 - b*df$x)^2), c(10, -10))$minimum
## [1] 3.366177
(sum(df$x * df$y) - 2 * sum(df$x))/sum((df$x)^2)
## [1] 3.366177

1.4.4 The lm() Function

lm1 <- lm(y ~ x, data = df)
length(lm1)
## [1] 12
lm1$coefficients
## (Intercept)           x 
##    2.181997    3.092764

1.5 Measuring Uncertainty

1.5.1 Data Simulations

K <- 1000
sim_res <- lapply(1:K, function(i) lm(y ~ x, data = gen_data())$coefficients)
sim_res <- bind_rows(sim_res)
names(sim_res) <- c("Estimated a", "Estimated b")
tab_res <- t(as.matrix(bind_rows(lapply(sim_res, summary))))
colnames(tab_res) <- c("Estimated a", "Estimated b")

print(xtable(tab_res, digits = 3),
      type = "html")
Estimated a Estimated b
Min. 1.389 1.754
1st Qu. 1.874 2.758
Median 2.014 2.979
Mean 2.012 2.984
3rd Qu. 2.155 3.206
Max. 2.772 4.466

1.5.2 Introduction to the Bootstrap

1.5.3 Bootstrap in R

set.seed(123456789)
K <- 1000
tab_res <- lapply(1:K, function(i) lm(y ~ x, data = df[round(runif(N, min=1, max=N)),])$coefficients)

my_summ <- function(x) {
  tibble(mean = mean(x), sd = sd(x),
         p2.5 = quantile(x, 0.025),
         p97.5 = quantile(x, 0.975))
}
tab_res <- bind_rows(tab_res)
names(tab_res) <- c("Estimated a", "Estimated b")
tab_res <- bind_rows(lapply(tab_res, my_summ), .id = "variable")
print(xtable(tab_res),
      include.rownames = FALSE,
      type = "html")
variable mean sd p2.5 p97.5
Estimated a 2.21 0.21 1.79 2.65
Estimated b 3.06 0.36 2.35 3.74
print(xtable(summary(lm1), digits = 3),
      type = "html")
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.182 0.182 11.957 0.000
x 3.093 0.318 9.736 0.000

1.6 Returns to Schooling

1.6.1 A Linear Model of Returns to Schooling

1.6.2 NLSM Data

1.6.3 Plotting Returns to Schooling

lm1 <- lm(lwage76 ~ ed76, data = nlsm)

1.6.4 Plotting Returns to Schooling

ggplot(data = nlsm) + 
  geom_point(aes(x = ed76, y = lwage76)) +
  geom_abline(intercept = lm1$coefficients[1], slope = lm1$coefficients[2])
## Warning: Removed 603 rows containing missing values (geom_point).

exp(log(mean(nlsm$wage76, na.rm = TRUE)) + 
      lm1$coefficients[2])/mean(nlsm$wage76, na.rm = TRUE)
##     ed76 
## 1.053475