Chapter 10 Panel Data
10.1 First differences
10.1.1 First difference model
10.1.2 Simulated panel data
library(dplyr, warn.conflicts = FALSE)
library(lmwr)
library(tidyr)
library(stargazer)
library(lfe)
set.seed(123456789)
<- 1000
N <- 2
T <- -2
a <- 3
b <- -4
d <- c(5, 6) u
<- runif(N)
x1 <- x1 + runif(N)
x2 <- as.matrix(rbind(x1, x2))
x <- matrix(rnorm(N * T), nrow = T)
e <- a + b * x + d * u + e y
<- y[2, ] - y[1, ]
diffy <- x[2, ] - x[1, ]
diffx <- lm(diffy ~ diffx) lm1
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | -3.77 | 0.09 | -41.84 | 0.00 |
diffx | 2.75 | 0.15 | 17.83 | 0.00 |
If we wanted data in a tidy data frame, we could use functions from tidyr
to rearrange the matrices and vectors above into a tidy data frame with and id
column for each individual and a time
column for periods \(1\) and \(2\).
However, as the following code suggests, while the code works, this approach is quite unwieldy.
<-
df tibble(id = 1:(dim(y)[2]), y_1 = y[1,], y_2 = y[2, ],
x_1 = x[1, ], x_2 = x[2,]) %>%
pivot_longer(cols = y_1:x_2) %>%
separate(name, into = c("var", "time"), sep = "_",
convert = TRUE) %>%
pivot_wider(id_cols = c("id", "time"), names_from = "var",
values_from = "value")
df
## # A tibble: 2,000 x 4
## id time y x
## <int> <int> <dbl> <dbl>
## 1 1 1 -20.5 0.693
## 2 1 2 -22.4 1.30
## 3 2 1 -21.0 0.673
## 4 2 2 -22.2 1.41
## 5 3 1 -19.3 0.654
## 6 3 2 -22.2 1.31
## 7 4 1 -19.7 0.719
## 8 4 2 -20.6 1.42
## 9 5 1 -21.0 0.922
## 10 5 2 -19.4 1.83
## # … with 1,990 more rows
A better approach is probably to build up the data using a “tidy” approach from the outset.
Note that the arrange
below is only needed to ensure that the results line up with those above and could be omitted in general.
set.seed(123456789)
<- tibble(id = 1:N, time = 1L, x = runif(N), u = 5)
df_pre <- tibble(id = 1:N, time = 2L, x = df_pre$x + runif(N), u = 6)
df_post
<-
df_all bind_rows(df_pre, df_post) %>%
arrange(id, time) %>%
mutate(e = rnorm(N * T)) %>%
mutate(y = a + b * x + d * u + e) %>%
select(-u, -e)
df_all
## # A tibble: 2,000 x 4
## id time x y
## <int> <int> <dbl> <dbl>
## 1 1 1 0.693 -20.5
## 2 1 2 1.30 -22.4
## 3 2 1 0.673 -21.0
## 4 2 2 1.41 -22.2
## 5 3 1 0.654 -19.3
## 6 3 2 1.31 -22.2
## 7 4 1 0.719 -19.7
## 8 4 2 1.42 -20.6
## 9 5 1 0.922 -21.0
## 10 5 2 1.83 -19.4
## # … with 1,990 more rows
10.1.3 OLS estimation of first differences
%>%
df group_by(id) %>%
arrange(time) %>%
mutate(diffy = y - lag(y),
diffx = x - lag(x)) %>%
lm(diffy ~ diffx, data = .)
##
## Call:
## lm(formula = diffy ~ diffx, data = .)
##
## Coefficients:
## (Intercept) diffx
## -3.771 2.748
10.2 Difference in difference
10.2.1 Difference-in-difference estimator
10.2.2 Difference-in-difference estimator in R
The following version of the f_did
function substracts four lines from that in the book.
<- function(y, treat) {
f_did <- y[1, ]
y1 <- y[2, ]
y2 <- matrix(NA, 3, 3)
did 1, 1] <- mean(y2[treat==1])
did[2, 1] <- mean(y1[treat==1])
did[1, 2] <- mean(y2[treat==0])
did[2, 2] <- mean(y1[treat==0])
did[3, ] <- did[1, ] - did[2, ]
did[3] <- did[ , 1] - did[ , 2]
did[ , row.names(did) <- c("Period 2", "Period 1", "Diff")
colnames(did) <- c("Treated", "Not Treated", "Diff")
return(did)
}
set.seed(123456789)
<- runif(N) < 0.5
treat <- x1 + treat
x2 <- rbind(x1, x2)
x <- a + b * x + d * u + e
y <- f_did(y, treat) did1
Treated | Not Treated | Diff | |
---|---|---|---|
Period 2 | -22.16 | -23.66 | 1.50 |
Period 1 | -21.25 | -19.77 | -1.49 |
Diff | -0.90 | -3.89 | 2.99 |
10.3 Minimum wage increase in New Jersey
Here I was too lazy to do the “base R” version. Note that Adams (2021) states “Card and Krueger (1994) present results on a measure called full-time equivalence. It is unclear how that measure is calculated.” Actually if you go to the website for the data, you can see from the provided SAS code that it is calculated as below. I don’t quite reproduce the results in Table 3 of the paper, but I get very close.
10.3.1 Data from Card and Krueger (1994)
We used cardkrueger
from lmwr
.
<-
nj_min_pre %>%
cardkrueger select(sheet, empft, emppt, state, nmgrs) %>%
rename(ft = empft, pt = emppt) %>%
mutate(time = 1)
<-
nj_min_post %>%
cardkrueger select(sheet, empft2, emppt2, state, nmgrs2) %>%
rename(ft = empft2, pt = emppt2, nmgrs = nmgrs2) %>%
mutate(time = 2)
<-
nj_min bind_rows(nj_min_pre, nj_min_post) %>%
mutate(total = ft + pt,
fte = ft + pt * 0.5 + nmgrs,
post = time == 2,
treat = state == 1) %>%
group_by(sheet) %>%
mutate(balanced = sum(!is.na(total)) == 2) %>%
ungroup()
10.3.2 Difference-in-Difference Estimates
<- list(lm(total ~ post * treat, data = nj_min),
fms lm(total ~ post * treat, data = nj_min, subset = balanced),
lm(fte ~ post * treat, data = nj_min),
lm(fte ~ post * treat, data = nj_min, subset = balanced))
Dependent variable: | ||||
total | fte | |||
(1) | (2) | (3) | (4) | |
post | -2.173 | -2.237 | -2.166 | -2.239 |
(1.952) | (1.986) | (1.516) | (1.542) | |
treat | -3.286** | -3.309** | -2.892** | -2.952** |
(1.531) | (1.565) | (1.194) | (1.219) | |
postTRUE:treat | 2.487 | 2.440 | 2.754 | 2.759 |
(2.173) | (2.213) | (1.688) | (1.719) | |
Constant | 29.692*** | 29.711*** | 23.331*** | 23.380*** |
(1.376) | (1.404) | (1.072) | (1.094) | |
Observations | 801 | 782 | 794 | 775 |
R2 | 0.006 | 0.006 | 0.007 | 0.008 |
Note: | p<0.1; p<0.05; p<0.01 |
10.4 Fixed Effects
10.4.1 Fixed-Effects Estimator
10.4.2 Nuisance Parameter
10.4.3 Adjusted Fixed-Effects Estimator
10.4.4 Two-Step Fixed-Effects Estimator
10.4.5 Fixed Effects Estimator in R
Note that the code in the book incorrectly describes XT
and XC
(“in each row” and “in each column” are flipped).
<- function(Y, X = NULL, cross = TRUE) {
f_fe <- as.matrix(Y)
Y <- dim(Y)[1]
T <- dim(Y)[2]
N
# Creates a T x N matrix with numbers 1 to T in each column
<- matrix(rep(c(1:T), N), nrow = T)
XT <- as.vector(Y)
y <- as.vector(XT)
t
# Set up for different cases
if (cross) {
# Create cross-section dummies
# Creates a T x N matrix with 1 to N in each row
<- t(matrix(rep(c(1:N), T), nrow = N))
XC <- as.vector(XC)
c
}
if (!is.null(X)) {
# Create treatment variable
<- as.matrix(X)
X <- as.vector(X)
treat
}
# Estimator
if (cross & !is.null(X)) {
# Standard case
<- lm(y ~ treat + as.factor(t) + as.factor(c))
lm1 else if (!is.null(X)) {
} # No cross-section
<- lm(y ~ treat + as.factor(t))
lm1 else {
} # No treatment
<- lm(y ~ as.factor(t))
lm1
}return(lm1)
}
The simulated panel data set has 100 individuals observed over 10 time periods.
set.seed(123456789)
<- 100
N <- 10
T <- runif(N)
alpha <- runif(T)
gamma <- 3
beta <- matrix(rnorm(N * T), nrow = T)
epsilon <- runif(N) < 0.5
treat <- t(matrix(rep(alpha, T), nrow = N)) + gamma + epsilon
y 1, ] <- y[1, ] + beta * treat
y[<- matrix(0, T, N)
treat1 1, ] <- treat treat1[
# Standard estimator
<- f_fe(y, treat1)
lm1
# No invidividual fixed effects
<- f_fe(y, treat1, cross = FALSE)
lm2
# Adjusted estimator
<- y[2:T, ]
y0 <- colMeans(y0) # Calculate alpha
alpha_hat <- y - t(matrix(rep(alpha_hat, T), nrow = N))
y2 <- f_fe(y2, treat1, cross = FALSE)
lm3
# Two-step estimator
<- f_fe(y0, cross = FALSE)
lm4 <- matrix(lm4$residuals, nrow = T - 1)
y0_res <- colMeans(y0_res) # Calculate alpha
alpha_hat <- y - t(matrix(rep(alpha_hat, T), nrow = N)) # Adjust outcome
y3 <- f_fe(y3, treat1, cross = FALSE) lm5
stargazer(list(lm1, lm2, lm3, lm5),
keep.stat = c("n", "rsq"), keep = "treat", type = "html")
Dependent variable: | ||||
y | ||||
(1) | (2) | (3) | (4) | |
treat | 2.998*** | 3.060*** | 2.998*** | 2.998*** |
(0.215) | (0.214) | (0.195) | (0.195) | |
Observations | 1,000 | 1,000 | 1,000 | 1,000 |
R2 | 0.458 | 0.340 | 0.378 | 0.378 |
Note: | p<0.1; p<0.05; p<0.01 |
Now, rearrange the data into a “tidy” form.
<-
df tibble(y = as.vector(y),
treat = as.vector(treat1),
id = as.vector(t(matrix(rep(c(1:N), T), nrow = N))),
t = as.vector(rep(c(T:1), N)))
%>% print(n = 15) df
## # A tibble: 1,000 x 4
## y treat id t
## <dbl> <dbl> <int> <int>
## 1 -0.516 0 1 10
## 2 1.11 0 1 9
## 3 2.73 0 1 8
## 4 -0.916 0 1 7
## 5 3.10 0 1 6
## 6 0.562 0 1 5
## 7 1.29 0 1 4
## 8 0.720 0 1 3
## 9 1.06 0 1 2
## 10 0.210 0 1 1
## 11 1.87 0 2 10
## 12 2.34 0 2 9
## 13 3.24 0 2 8
## 14 2.14 0 2 7
## 15 1.12 0 2 6
## # … with 985 more rows
And use the felm
function from the lfe
package to reproduce the regression results above.
<- felm(y ~ treat | id + t, data = df)
felm1 <- felm(y ~ treat | t, data = df)
felm2
<-
df_alpha_hat %>%
df filter(t < 10) %>%
mutate(y0_res = residuals(lm(y ~ as.factor(t), data = .))) %>%
group_by(id) %>%
summarize(alpha_hat = mean(y0_res),
mean_y = mean(y), .groups = "drop")
<-
felm3 %>%
df inner_join(df_alpha_hat, by = "id") %>%
mutate(y = y - mean_y) %>%
felm(y ~ treat | t, data = .)
<-
felm5 %>%
df inner_join(df_alpha_hat, by = "id") %>%
mutate(y = y - alpha_hat) %>%
felm(y ~ treat | t, data = .)
stargazer(list(felm1, felm2, felm3, felm5),
keep.stat = c("n", "rsq"), keep = "treat", type = "html")
Dependent variable: | ||||
y | ||||
(1) | (2) | (3) | (4) | |
treat | 2.998*** | 3.060*** | 2.998*** | 2.998*** |
(0.215) | (0.214) | (0.195) | (0.195) | |
Observations | 1,000 | 1,000 | 1,000 | 1,000 |
R2 | 0.458 | 0.340 | 0.378 | 0.378 |
Note: | p<0.1; p<0.05; p<0.01 |