# Chapter 10 Panel Data

## 10.1 First differences

### 10.1.2 Simulated panel data

library(dplyr, warn.conflicts = FALSE)
library(lmwr)
library(tidyr)
library(stargazer)
library(lfe) 
set.seed(123456789)
# Simulated two-period panel: N individuals, T = 2 time periods.
# NOTE(review): `T` masks the base-R shorthand for TRUE for the rest of
# the script — kept to match the book's notation.
N <- 1000
T <- 2
a <- -2       # intercept
b <- 3        # slope on x (the parameter first differencing recovers)
d <- -4       # loading on the period effect u
u <- c(5, 6)  # period effects for t = 1 and t = 2
x1 <- runif(N)       # period-1 regressor
x2 <- x1 + runif(N)  # period-2 regressor (period 1 value plus noise)
x <- as.matrix(rbind(x1, x2))        # T x N: rows = periods, cols = ids
e <- matrix(rnorm(N * T), nrow = T)  # idiosyncratic T x N errors
# u has length T, so it recycles down the columns of the T x N matrix:
# row 1 gets d * u[1], row 2 gets d * u[2].
y <- a + b * x + d * u + e
diffy <- y[2, ] - y[1, ]  # first difference of the outcome
diffx <- x[2, ] - x[1, ]  # first difference of the regressor
# First-difference OLS: the period effect u cancels out of diffy
lm1 <- lm(diffy ~ diffx)
            Estimate Std. Error t value Pr(>|t|)
(Intercept)    -3.77       0.09  -41.84     0.00
diffx           2.75       0.15   17.83     0.00

If we wanted data in a tidy data frame, we could use functions from tidyr to rearrange the matrices and vectors above into a tidy data frame with an id column for each individual and a time column for periods 1 and 2. However, as the following code suggests, while the code works, this approach is quite unwieldy.

# Rearrange the T x N matrices into a tidy (id, time, y, x) tibble.
# One id per individual, i.e. per column of y — `1:(dim(y))` would use
# only the first element of dim(y) (the row count, 2) with a warning,
# so seq_len(ncol(y)) is required here.
df <-
tibble(id = seq_len(ncol(y)), y_1 = y[1, ], y_2 = y[2, ],
x_1 = x[1, ], x_2 = x[2, ]) %>%
# Stack the four value columns, then split names like "y_1" into the
# variable ("y") and the period (1), and spread back out by variable.
pivot_longer(cols = y_1:x_2) %>%
separate(name, into = c("var", "time"), sep = "_",
convert = TRUE) %>%
pivot_wider(id_cols = c("id", "time"), names_from = "var",
values_from = "value")

df
## # A tibble: 2,000 x 4
##       id  time     y     x
##    <int> <int> <dbl> <dbl>
##  1     1     1 -20.5 0.693
##  2     1     2 -22.4 1.30
##  3     2     1 -21.0 0.673
##  4     2     2 -22.2 1.41
##  5     3     1 -19.3 0.654
##  6     3     2 -22.2 1.31
##  7     4     1 -19.7 0.719
##  8     4     2 -20.6 1.42
##  9     5     1 -21.0 0.922
## 10     5     2 -19.4 1.83
## # … with 1,990 more rows

A better approach is probably to build up the data using a “tidy” approach from the outset. Note that the arrange below is only needed to ensure that the results line up with those above and could be omitted in general.

# Re-seed so the draws line up with the matrix-based simulation above
set.seed(123456789)

# Period-1 rows: u is the period effect (5 in period 1, 6 in period 2)
df_pre <- tibble(id = 1:N, time = 1L, x = runif(N), u = 5)
df_post <- tibble(id = 1:N, time = 2L, x = df_pre$x + runif(N), u = 6)

df_all <-
bind_rows(df_pre, df_post) %>%
arrange(id, time) %>%
mutate(e = rnorm(N * T)) %>%
mutate(y = a + b * x + d * u + e) %>%
select(-u, -e)

df_all
## # A tibble: 2,000 x 4
##       id  time     x     y
##    <int> <int> <dbl> <dbl>
##  1     1     1 0.693 -20.5
##  2     1     2 1.30  -22.4
##  3     2     1 0.673 -21.0
##  4     2     2 1.41  -22.2
##  5     3     1 0.654 -19.3
##  6     3     2 1.31  -22.2
##  7     4     1 0.719 -19.7
##  8     4     2 1.42  -20.6
##  9     5     1 0.922 -21.0
## 10     5     2 1.83  -19.4
## # … with 1,990 more rows

### 10.1.3 OLS estimation of first differences

df %>%
group_by(id) %>%
arrange(time) %>%
mutate(diffy = y - lag(y), diffx = x - lag(x)) %>%
lm(diffy ~ diffx, data = .)
##
## Call:
## lm(formula = diffy ~ diffx, data = .)
##
## Coefficients:
## (Intercept)        diffx
##      -3.771        2.748

## 10.2 Difference in difference

### 10.2.1 Difference-in-difference estimator

### 10.2.2 Difference-in-difference estimator in R

The following version of the f_did function subtracts four lines from that in the book.

f_did <- function(y, treat) {
  y1 <- y[1, ]
  y2 <- y[2, ]
  did <- matrix(NA, 3, 3)
  did[1, 1] <- mean(y2[treat == 1])
  did[2, 1] <- mean(y1[treat == 1])
  did[1, 2] <- mean(y2[treat == 0])
  did[2, 2] <- mean(y1[treat == 0])
  did[3, ] <- did[1, ] - did[2, ]
  did[, 3] <- did[, 1] - did[, 2]
  row.names(did) <- c("Period 2", "Period 1", "Diff")
  colnames(did) <- c("Treated", "Not Treated", "Diff")
  return(did)
}

set.seed(123456789)
treat <- runif(N) < 0.5
x2 <- x1 + treat
x <- rbind(x1, x2)
y <- a + b * x + d * u + e
did1 <- f_did(y, treat)

         Treated Not Treated  Diff
Period 2  -22.16      -23.66  1.50
Period 1  -21.25      -19.77 -1.49
Diff       -0.90       -3.89  2.99

## 10.3 Minimum wage increase in New Jersey

Here I was too lazy to do the "base R" version. Note that Adams (2021) states "Card and Krueger (1994) present results on a measure called full-time equivalence.
It is unclear how that measure is calculated." Actually if you go to the website for the data, you can see from the provided SAS code that it is calculated as below. I don't quite reproduce the results in Table 3 of the paper, but I get very close.

### 10.3.1 Data from Card and Krueger (1994)

We used cardkrueger from lmwr.

nj_min_pre <-
cardkrueger %>%
select(sheet, empft, emppt, state, nmgrs) %>%
rename(ft = empft, pt = emppt) %>%
mutate(time = 1)

nj_min_post <-
cardkrueger %>%
select(sheet, empft2, emppt2, state, nmgrs2) %>%
rename(ft = empft2, pt = emppt2, nmgrs = nmgrs2) %>%
mutate(time = 2)

nj_min <-
bind_rows(nj_min_pre, nj_min_post) %>%
mutate(total = ft + pt,
       fte = ft + pt * 0.5 + nmgrs,
       post = time == 2,
       treat = state == 1) %>%
group_by(sheet) %>%
mutate(balanced = sum(!is.na(total)) == 2) %>%
ungroup()

### 10.3.2 Difference-in-Difference Estimates

fms <- list(lm(total ~ post * treat, data = nj_min),
            lm(total ~ post * treat, data = nj_min, subset = balanced),
            lm(fte ~ post * treat, data = nj_min),
            lm(fte ~ post * treat, data = nj_min, subset = balanced))

                          Dependent variable:
                    total                    fte
                 (1)       (2)        (3)       (4)
post           -2.173    -2.237     -2.166    -2.239
               (1.952)   (1.986)    (1.516)   (1.542)
treat          -3.286**  -3.309**   -2.892**  -2.952**
               (1.531)   (1.565)    (1.194)   (1.219)
postTRUE:treat  2.487     2.440      2.754     2.759
               (2.173)   (2.213)    (1.688)   (1.719)
Constant       29.692*** 29.711***  23.331*** 23.380***
               (1.376)   (1.404)    (1.072)   (1.094)
Observations    801       782        794       775
R2              0.006     0.006      0.007     0.008
Note: *p<0.1; **p<0.05; ***p<0.01

## 10.4 Fixed Effects

### 10.4.1 Fixed-Effects Estimator

### 10.4.2 Nuisance Parameter

### 10.4.3 Adjusted Fixed-Effects Estimator

### 10.4.4 Two-Step Fixed-Effects Estimator

### 10.4.5 Fixed Effects Estimator in R

Note that the code in the book incorrectly describes XT and XC ("in each row" and "in each column" are flipped).
# Fixed-effects estimator on a T x N outcome matrix Y (rows = time
# periods, columns = individuals). X, if supplied, is a T x N treatment
# matrix; cross = TRUE adds individual (cross-section) dummies.
# Returns the fitted lm object.
f_fe <- function(Y, X = NULL, cross = TRUE) {
  Y <- as.matrix(Y)
  # dim(Y) is c(rows, cols); take each element explicitly
  T <- dim(Y)[1]  # number of time periods
  N <- dim(Y)[2]  # number of individuals
  # Creates a T x N matrix with numbers 1 to T in each column
  XT <- matrix(rep(c(1:T), N), nrow = T)
  y <- as.vector(Y)
  t <- as.vector(XT)
  # Set up for different cases
  if (cross) {
    # Create cross-section dummies
    # Creates a T x N matrix with 1 to N in each row
    XC <- t(matrix(rep(c(1:N), T), nrow = N))
    c <- as.vector(XC)
  }
  if (!is.null(X)) {
    # Create treatment variable
    X <- as.matrix(X)
    treat <- as.vector(X)
  }
  # Estimator (scalar conditions, so && rather than &)
  if (cross && !is.null(X)) {
    # Standard case: treatment plus time and individual effects
    lm1 <- lm(y ~ treat + as.factor(t) + as.factor(c))
  } else if (!is.null(X)) {
    # No cross-section effects
    lm1 <- lm(y ~ treat + as.factor(t))
  } else {
    # No treatment variable
    lm1 <- lm(y ~ as.factor(t))
  }
  return(lm1)
}

The simulated panel data set has 100 individuals observed over 10 time periods.

set.seed(123456789)
N <- 100
T <- 10
alpha <- runif(N)   # individual effects
gamma <- runif(T)   # time effects
beta <- 3           # treatment effect
epsilon <- matrix(rnorm(N * T), nrow = T)
treat <- runif(N) < 0.5
# gamma (length T) recycles down the columns of the T x N matrix
y <- t(matrix(rep(alpha, T), nrow = N)) + gamma + epsilon
# Treatment hits matrix row 1 only
y[1, ] <- y[1, ] + beta * treat
treat1 <- matrix(0, T, N)
treat1[1, ] <- treat

# Standard estimator
lm1 <- f_fe(y, treat1)
# No individual fixed effects
lm2 <- f_fe(y, treat1, cross = FALSE)
# Adjusted estimator: demean by untreated-period means
y0 <- y[2:T, ]
alpha_hat <- colMeans(y0)  # Calculate alpha
y2 <- y - t(matrix(rep(alpha_hat, T), nrow = N))
lm3 <- f_fe(y2, treat1, cross = FALSE)
# Two-step estimator: alpha from residuals of a time-effects regression
lm4 <- f_fe(y0, cross = FALSE)
y0_res <- matrix(lm4$residuals, nrow = T - 1)
alpha_hat <- colMeans(y0_res)  # Calculate alpha
y3 <- y - t(matrix(rep(alpha_hat, T), nrow = N))  # Adjust outcome
lm5 <- f_fe(y3, treat1, cross = FALSE)

stargazer(list(lm1, lm2, lm3, lm5),
          keep.stat = c("n", "rsq"), keep = "treat", type = "html")
              Dependent variable: y
         (1)      (2)      (3)      (4)
treat  2.998*** 3.060*** 2.998*** 2.998***
       (0.215)  (0.214)  (0.195)  (0.195)
Observations  1,000  1,000  1,000  1,000
R2            0.458  0.340  0.378  0.378
Note: *p<0.1; **p<0.05; ***p<0.01

Now, rearrange the data into a “tidy” form.

# Flatten the T x N matrices into a long tibble. as.vector() stacks a
# matrix column by column, i.e. one individual's T periods at a time.
df <-
tibble(y = as.vector(y),
treat = as.vector(treat1),
# id: each individual's number repeated T times, in column order
id = as.vector(t(matrix(rep(c(1:N), T), nrow = N))),
# t runs T down to 1 within each id, so matrix row 1 (the treated
# period, treat1[1, ]) is labelled t = T. NOTE(review): presumably
# deliberate — filter(t < 10) below then matches y0 <- y[2:T, ].
t = as.vector(rep(c(T:1), N)))

df %>% print(n = 15)
## # A tibble: 1,000 x 4
##         y treat    id     t
##     <dbl> <dbl> <int> <int>
##  1 -0.516     0     1    10
##  2  1.11      0     1     9
##  3  2.73      0     1     8
##  4 -0.916     0     1     7
##  5  3.10      0     1     6
##  6  0.562     0     1     5
##  7  1.29      0     1     4
##  8  0.720     0     1     3
##  9  1.06      0     1     2
## 10  0.210     0     1     1
## 11  1.87      0     2    10
## 12  2.34      0     2     9
## 13  3.24      0     2     8
## 14  2.14      0     2     7
## 15  1.12      0     2     6
## # … with 985 more rows

And use the felm function from the lfe package to reproduce the regression results above.

# felm1: individual (id) and time (t) fixed effects — matches lm1 above
felm1 <- felm(y ~ treat | id + t, data = df)
# felm2: time fixed effects only — matches lm2
felm2 <- felm(y ~ treat | t, data = df)

# Per-individual adjustment terms computed from the untreated periods
# (t < 10): alpha_hat is the mean residual from a time-effects-only
# regression (two-step estimator); mean_y is the plain pre-period mean
# of y (adjusted estimator).
df_alpha_hat <-
df %>%
filter(t < 10) %>%
mutate(resid0 = residuals(lm(y ~ as.factor(t), data = .))) %>%
group_by(id) %>%
summarize(alpha_hat = mean(resid0),
mean_y = mean(y),
.groups = "drop")

# Adjusted estimator: subtract each individual's untreated-period mean
# from y, then run the time-effects-only regression — matches lm3 above
felm3 <-
df %>%
inner_join(df_alpha_hat, by = "id") %>%
mutate(y = y - mean_y) %>%
felm(y ~ treat | t, data = .)

# Two-step estimator: subtract the residual-based alpha_hat instead of
# the raw mean — matches lm5 above
felm5 <-
df %>%
inner_join(df_alpha_hat, by = "id") %>%
mutate(y = y - alpha_hat) %>%
felm(y ~ treat | t, data = .)
# Report the four felm fits side by side (treat coefficient only)
stargazer(list(felm1, felm2, felm3, felm5),
keep.stat = c("n", "rsq"), keep = "treat", type = "html")
              Dependent variable: y
         (1)      (2)      (3)      (4)
treat  2.998*** 3.060*** 2.998*** 2.998***
       (0.215)  (0.214)  (0.195)  (0.195)
Observations  1,000  1,000  1,000  1,000
R2            0.458  0.340  0.378  0.378
Note: *p<0.1; **p<0.05; ***p<0.01