Chapter 1 Ordinary Least Squares
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(xtable)
library(lmwr)
1.1 Introduction
1.2 Estimating the Causal Effect
1.2.1 Graphing the Causal Effect
1.2.2 A Linear Causal Model
1.2.3 Simulation of the Causal Effect
set.seed(123456789)
<- 100
N <- 2
a <- 3
b <- 3
c <- 4
d
<- function() {
gen_data <- runif(N)
x <- rnorm(N)
u <- a + b* x + u
y data.frame(y, x, u)
}
<- gen_data() df
1.2.4 Averaging to Estimate the Causal Effect
%>% filter(x > 0.95) %>% summarize(mean(y)) %>% pull() -
df %>% filter(x < 0.05) %>% summarize(mean(y)) %>% pull()) (df
## [1] 2.721387
%>%
df ggplot() +
geom_point(aes(x, y)) +
geom_abline(intercept = 2, slope = 3)
### Assumptions of the OLS Model
1.3 Matrix Algebra of the OLS Model
1.3.1 Standard Algebra of the OLS Model
1.3.2 Algebraic OLS Estimator in R
1.3.3 Using Matrices
1.3.4 Multiplying Matrices in R
Oops!! Seems we have switched to =
for assignment instead of <-
in the book here.
<- df$x[1:5]
x1 <- cbind(1, x1)
X1 as.vector(X1 %*% c(2, 3))
## [1] 4.079527 4.018643 3.961705 4.156967 4.764634
Compare to the true values:
$y[1:5] df
## [1] 4.224902 4.219999 5.374130 3.378327 5.541021
1.3.5 Matrix Estimator of OLS
1.3.6 Matrix Estimator of OLS in R
<- cbind(1, df$x) X
<- matrix(c(1:6), nrow=3)
A A
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
t(A)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
t(A) %*% A
## [,1] [,2]
## [1,] 14 32
## [2,] 32 77
t(X) %*% X
## [,1] [,2]
## [1,] 100.00000 49.56876
## [2,] 49.56876 32.99536
solve(t(X) %*% X)
## [,1] [,2]
## [1,] 0.03916481 -0.05883708
## [2,] -0.05883708 0.11869792
<- solve(t(X) %*% X) %*% t(X) %*% df$y
beta_hat beta_hat
## [,1]
## [1,] 2.181997
## [2,] 3.092764
solve(t(X) %*% X) %*% t(X) %*% df$u
## [,1]
## [1,] 0.1819969
## [2,] 0.0927640
1.4 Least Squares Method for OLS
1.4.1 Moment Estimation
1.4.2 Algebra of Least Squares
1.4.3 Estimating Least Squares in R
optimize(function(b) sum((df$y - 2 - b*df$x)^2), c(10, -10))$minimum
## [1] 3.366177
sum(df$x * df$y) - 2 * sum(df$x))/sum((df$x)^2) (
## [1] 3.366177
1.4.4 The lm()
Function
<- lm(y ~ x, data = df)
lm1 length(lm1)
## [1] 12
$coefficients lm1
## (Intercept) x
## 2.181997 3.092764
1.5 Measuring Uncertainty
1.5.1 Data Simulations
<- 1000
K <- lapply(1:K, function(i) lm(y ~ x, data = gen_data())$coefficients)
sim_res <- bind_rows(sim_res)
sim_res names(sim_res) <- c("Estimated a", "Estimated b")
<- t(as.matrix(bind_rows(lapply(sim_res, summary))))
tab_res colnames(tab_res) <- c("Estimated a", "Estimated b")
print(xtable(tab_res, digits = 3),
type = "html")
Estimated a | Estimated b | |
---|---|---|
Min. | 1.389 | 1.754 |
1st Qu. | 1.874 | 2.758 |
Median | 2.014 | 2.979 |
Mean | 2.012 | 2.984 |
3rd Qu. | 2.155 | 3.206 |
Max. | 2.772 | 4.466 |
1.5.2 Introduction to the Bootstrap
1.5.3 Bootstrap in R
set.seed(123456789)
<- 1000
K <- lapply(1:K, function(i) lm(y ~ x, data = df[round(runif(N, min=1, max=N)),])$coefficients)
tab_res
<- function(x) {
my_summ tibble(mean = mean(x), sd = sd(x),
p2.5 = quantile(x, 0.025),
p97.5 = quantile(x, 0.975))
}<- bind_rows(tab_res)
tab_res names(tab_res) <- c("Estimated a", "Estimated b")
<- bind_rows(lapply(tab_res, my_summ), .id = "variable") tab_res
print(xtable(tab_res),
include.rownames = FALSE,
type = "html")
variable | mean | sd | p2.5 | p97.5 |
---|---|---|---|---|
Estimated a | 2.21 | 0.21 | 1.79 | 2.65 |
Estimated b | 3.06 | 0.36 | 2.35 | 3.74 |
print(xtable(summary(lm1), digits = 3),
type = "html")
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 2.182 | 0.182 | 11.957 | 0.000 |
x | 3.093 | 0.318 | 9.736 | 0.000 |
1.6 Returns to Schooling
1.6.1 A Linear Model of Returns to Schooling
1.6.2 NLSM Data
1.6.3 Plotting Returns to Schooling
<- lm(lwage76 ~ ed76, data = nlsm) lm1
1.6.4 Plotting Returns to Schooling
ggplot(data = nlsm) +
geom_point(aes(x = ed76, y = lwage76)) +
geom_abline(intercept = lm1$coefficients[1], slope = lm1$coefficients[2])
## Warning: Removed 603 rows containing missing values (geom_point).
exp(log(mean(nlsm$wage76, na.rm = TRUE)) +
$coefficients[2])/mean(nlsm$wage76, na.rm = TRUE) lm1
## ed76
## 1.053475