Chapter 10 Panel Data

10.1 First differences

10.1.1 First difference model

10.1.2 Simulated panel data

library(dplyr, warn.conflicts = FALSE)
library(lmwr)
library(tidyr)
library(stargazer)
library(lfe) 
set.seed(123456789)
N <- 1000
T <- 2
a <- -2
b <- 3
d <- -4
u <- c(5, 6)
x1 <- runif(N)
x2 <- x1 + runif(N)
x <- as.matrix(rbind(x1, x2))
e <- matrix(rnorm(N * T), nrow = T)
y <- a + b * x + d * u + e
diffy <- y[2, ] - y[1, ]
diffx <- x[2, ] - x[1, ]
lm1 <- lm(diffy ~ diffx)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.77 0.09 -41.84 0.00
diffx 2.75 0.15 17.83 0.00

If we wanted data in a tidy data frame, we could use functions from tidyr to rearrange the matrices and vectors above into a tidy data frame with and id column for each individual and a time column for periods \(1\) and \(2\). However, as the following code suggests, while the code works, this approach is quite unwieldy.

df <- 
  tibble(id = 1:(dim(y)[2]), y_1 = y[1,], y_2 = y[2, ], 
             x_1 = x[1, ], x_2 = x[2,]) %>% 
  pivot_longer(cols = y_1:x_2) %>%
  separate(name, into = c("var", "time"), sep = "_", 
           convert = TRUE) %>%
  pivot_wider(id_cols = c("id", "time"), names_from = "var",
              values_from = "value")

df
## # A tibble: 2,000 x 4
##       id  time     y     x
##    <int> <int> <dbl> <dbl>
##  1     1     1 -20.5 0.693
##  2     1     2 -22.4 1.30 
##  3     2     1 -21.0 0.673
##  4     2     2 -22.2 1.41 
##  5     3     1 -19.3 0.654
##  6     3     2 -22.2 1.31 
##  7     4     1 -19.7 0.719
##  8     4     2 -20.6 1.42 
##  9     5     1 -21.0 0.922
## 10     5     2 -19.4 1.83 
## # … with 1,990 more rows

A better approach is probably to build up the data using a “tidy” approach from the outset. Note that the arrange below is only needed to ensure that the results line up with those above and could be omitted in general.

set.seed(123456789)

df_pre <- tibble(id = 1:N, time = 1L, x = runif(N), u = 5)
df_post <- tibble(id = 1:N, time = 2L, x = df_pre$x + runif(N), u = 6)

df_all <-
  bind_rows(df_pre, df_post) %>%
  arrange(id, time) %>%
  mutate(e = rnorm(N * T)) %>%
  mutate(y = a + b * x + d * u + e) %>%
  select(-u, -e)

df_all
## # A tibble: 2,000 x 4
##       id  time     x     y
##    <int> <int> <dbl> <dbl>
##  1     1     1 0.693 -20.5
##  2     1     2 1.30  -22.4
##  3     2     1 0.673 -21.0
##  4     2     2 1.41  -22.2
##  5     3     1 0.654 -19.3
##  6     3     2 1.31  -22.2
##  7     4     1 0.719 -19.7
##  8     4     2 1.42  -20.6
##  9     5     1 0.922 -21.0
## 10     5     2 1.83  -19.4
## # … with 1,990 more rows

10.1.3 OLS estimation of first differences

df %>%
  group_by(id) %>%
  arrange(time) %>%
  mutate(diffy = y - lag(y),
         diffx = x - lag(x)) %>%
  lm(diffy ~ diffx, data = .)
## 
## Call:
## lm(formula = diffy ~ diffx, data = .)
## 
## Coefficients:
## (Intercept)        diffx  
##      -3.771        2.748

10.2 Difference in difference

10.2.1 Difference-in-difference estimator

10.2.2 Difference-in-difference estimator in R

The following version of the f_did function substracts four lines from that in the book.

f_did <- function(y, treat) {
  y1 <- y[1, ]
  y2 <- y[2, ]
  did <- matrix(NA, 3, 3)
  did[1, 1] <- mean(y2[treat==1])
  did[2, 1] <- mean(y1[treat==1])
  did[1, 2] <- mean(y2[treat==0])
  did[2, 2] <- mean(y1[treat==0])
  did[3, ] <- did[1, ] - did[2, ]
  did[ , 3] <- did[ , 1] - did[ , 2]
  row.names(did) <- c("Period 2", "Period 1", "Diff")
  colnames(did) <- c("Treated", "Not Treated", "Diff")
  return(did)
}
set.seed(123456789)
treat <- runif(N) < 0.5
x2 <- x1 + treat
x <- rbind(x1, x2)
y <- a + b * x + d * u + e
did1 <- f_did(y, treat)
Treated Not Treated Diff
Period 2 -22.16 -23.66 1.50
Period 1 -21.25 -19.77 -1.49
Diff -0.90 -3.89 2.99

10.3 Minimum wage increase in New Jersey

Here I was too lazy to do the “base R” version. Note that Adams (2021) states “Card and Krueger (1994) present results on a measure called full-time equivalence. It is unclear how that measure is calculated.” Actually if you go to the website for the data, you can see from the provided SAS code that it is calculated as below. I don’t quite reproduce the results in Table 3 of the paper, but I get very close.

10.3.1 Data from Card and Krueger (1994)

We used cardkrueger from lmwr.

nj_min_pre <- 
  cardkrueger %>%
  select(sheet, empft, emppt, state, nmgrs) %>%
  rename(ft = empft, pt = emppt) %>%
  mutate(time = 1) 

nj_min_post <- 
  cardkrueger %>%
  select(sheet, empft2, emppt2, state, nmgrs2) %>%
  rename(ft = empft2, pt = emppt2, nmgrs = nmgrs2) %>%
  mutate(time = 2)

nj_min <- 
  bind_rows(nj_min_pre, nj_min_post) %>%
  mutate(total = ft + pt,
         fte = ft + pt * 0.5 + nmgrs,
         post = time == 2, 
         treat = state == 1) %>%
  group_by(sheet) %>% 
  mutate(balanced = sum(!is.na(total)) == 2) %>%
  ungroup() 

10.3.2 Difference-in-Difference Estimates

fms <- list(lm(total ~ post * treat, data = nj_min),
            lm(total ~ post * treat, data = nj_min, subset = balanced),
            lm(fte ~ post * treat, data = nj_min),
            lm(fte ~ post * treat, data = nj_min, subset = balanced))
Dependent variable:
total fte
(1) (2) (3) (4)
post -2.173 -2.237 -2.166 -2.239
(1.952) (1.986) (1.516) (1.542)
treat -3.286** -3.309** -2.892** -2.952**
(1.531) (1.565) (1.194) (1.219)
postTRUE:treat 2.487 2.440 2.754 2.759
(2.173) (2.213) (1.688) (1.719)
Constant 29.692*** 29.711*** 23.331*** 23.380***
(1.376) (1.404) (1.072) (1.094)
Observations 801 782 794 775
R2 0.006 0.006 0.007 0.008
Note: p<0.1; p<0.05; p<0.01

10.4 Fixed Effects

10.4.1 Fixed-Effects Estimator

10.4.2 Nuisance Parameter

10.4.3 Adjusted Fixed-Effects Estimator

10.4.4 Two-Step Fixed-Effects Estimator

10.4.5 Fixed Effects Estimator in R

Note that the code in the book incorrectly describes XT and XC (“in each row” and “in each column” are flipped).

f_fe <- function(Y, X = NULL, cross = TRUE) {
  Y <- as.matrix(Y)
  T <- dim(Y)[1]
  N <- dim(Y)[2]
  
  # Creates a T x N matrix with numbers 1 to T in each column
  XT <- matrix(rep(c(1:T), N), nrow = T)
  y <- as.vector(Y)
  t <- as.vector(XT)
  
  # Set up for different cases
  if (cross) {
    # Create cross-section dummies
    
    # Creates a T x N matrix with 1 to N in each row
    XC <- t(matrix(rep(c(1:N), T), nrow = N))
    c <- as.vector(XC)
  }
  
  if (!is.null(X)) {
    # Create treatment variable
    X <- as.matrix(X)
    treat <- as.vector(X)
  }
  
  # Estimator
  if (cross & !is.null(X)) {
    # Standard case
    lm1 <- lm(y ~ treat + as.factor(t) + as.factor(c))
  } else if (!is.null(X)) {
    # No cross-section
    lm1 <- lm(y ~ treat + as.factor(t))
  } else {
    # No treatment
    lm1 <- lm(y ~ as.factor(t))
  }
  return(lm1)
}

The simulated panel data set has 100 individuals observed over 10 time periods.

set.seed(123456789)
N <- 100
T <- 10
alpha <- runif(N)
gamma <- runif(T)
beta <- 3
epsilon <- matrix(rnorm(N * T), nrow = T)
treat <- runif(N) < 0.5
y <- t(matrix(rep(alpha, T), nrow = N)) + gamma + epsilon
y[1, ] <- y[1, ] + beta * treat
treat1 <- matrix(0, T, N)
treat1[1, ] <- treat
# Standard estimator
lm1 <- f_fe(y, treat1)

# No invidividual fixed effects
lm2 <- f_fe(y, treat1, cross = FALSE)

# Adjusted estimator
y0 <- y[2:T, ] 
alpha_hat <- colMeans(y0) # Calculate alpha
y2 <- y - t(matrix(rep(alpha_hat, T), nrow = N))
lm3 <- f_fe(y2, treat1, cross = FALSE)

# Two-step estimator
lm4 <- f_fe(y0, cross = FALSE)
y0_res <- matrix(lm4$residuals, nrow = T - 1)
alpha_hat <- colMeans(y0_res) # Calculate alpha
y3 <- y - t(matrix(rep(alpha_hat, T), nrow = N)) # Adjust outcome
lm5 <- f_fe(y3, treat1, cross = FALSE)
stargazer(list(lm1, lm2, lm3, lm5),
          keep.stat = c("n", "rsq"), keep = "treat", type = "html")
Dependent variable:
y
(1) (2) (3) (4)
treat 2.998*** 3.060*** 2.998*** 2.998***
(0.215) (0.214) (0.195) (0.195)
Observations 1,000 1,000 1,000 1,000
R2 0.458 0.340 0.378 0.378
Note: p<0.1; p<0.05; p<0.01

Now, rearrange the data into a “tidy” form.

df <-
  tibble(y = as.vector(y), 
         treat = as.vector(treat1),
         id = as.vector(t(matrix(rep(c(1:N), T), nrow = N))),
         t = as.vector(rep(c(T:1), N)))

df %>% print(n = 15)
## # A tibble: 1,000 x 4
##         y treat    id     t
##     <dbl> <dbl> <int> <int>
##  1 -0.516     0     1    10
##  2  1.11      0     1     9
##  3  2.73      0     1     8
##  4 -0.916     0     1     7
##  5  3.10      0     1     6
##  6  0.562     0     1     5
##  7  1.29      0     1     4
##  8  0.720     0     1     3
##  9  1.06      0     1     2
## 10  0.210     0     1     1
## 11  1.87      0     2    10
## 12  2.34      0     2     9
## 13  3.24      0     2     8
## 14  2.14      0     2     7
## 15  1.12      0     2     6
## # … with 985 more rows

And use the felm function from the lfe package to reproduce the regression results above.

felm1 <- felm(y ~ treat | id + t, data = df)
felm2 <- felm(y ~ treat | t, data = df)

df_alpha_hat <-
  df %>% 
  filter(t < 10) %>%
  mutate(y0_res = residuals(lm(y ~ as.factor(t), data = .))) %>%
  group_by(id) %>%
  summarize(alpha_hat = mean(y0_res),
            mean_y = mean(y), .groups = "drop")

felm3 <-
  df %>%
  inner_join(df_alpha_hat, by = "id") %>%
  mutate(y = y - mean_y) %>%
  felm(y ~ treat | t, data = .)
  
felm5 <-
  df %>%
  inner_join(df_alpha_hat, by = "id") %>%
  mutate(y = y - alpha_hat) %>%
  felm(y ~ treat | t, data = .)
stargazer(list(felm1, felm2, felm3, felm5),
          keep.stat = c("n", "rsq"), keep = "treat", type = "html")
Dependent variable:
y
(1) (2) (3) (4)
treat 2.998*** 3.060*** 2.998*** 2.998***
(0.215) (0.214) (0.195) (0.195)
Observations 1,000 1,000 1,000 1,000
R2 0.458 0.340 0.378 0.378
Note: p<0.1; p<0.05; p<0.01