Chapter 6 Estimating Selection Models

6.1 Introduction

library(lmwr)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(stargazer)
library(AER)
library(sampleSelection)

6.2 Modelling Censored Data

6.2.1 A Model of Censored Data

6.2.2 Simulation of Censored Data

# Simulate censored data: latent y* = a + b*x + u, observed y = max(y*, 0)
set.seed(123456789)

N <- 500
a <- 2
b <- -3
x <- runif(N)
u <- rnorm(N)
y_star <- a + b * x + u
# y <- pmax(y_star, 0)
# Censor the latent outcome at zero
y <- ifelse(y_star > 0, y_star, 0)
# Naive OLS on the censored outcome: the slope is attenuated (true b = -3)
lm1 <- lm(y ~ x)
lm1$coefficients[2]
##         x 
## -2.026422
# Restricting to x < 0.6, where E[y*] = 2 - 3x is well above zero so
# censoring is rare, roughly recovers the true slope
lm(y[x < 0.6] ~ x[x < 0.6])$coefficients[2]
## x[x < 0.6] 
##  -2.993644
length(x[x < 0.6])
## [1] 291
# Same count, computed directly on the logical vector
sum(x < 0.6)
## [1] 291
# A probit of the censoring indicator recovers (a, b) because sd(u) = 1
glm(y > 0 ~ x, family = binomial(link = "probit"))$coefficients
## (Intercept)           x 
##    2.015881   -3.013131
# 6.2.5 Tobit estimator in R ----
f_tobit <- function(par, y, X) {
  # Negative log-likelihood of the type-I Tobit model with censoring at 0.
  #
  # par: c(log(sigma), beta). The first element is log(sigma) so the
  #      optimizer searches over an unconstrained scale parameter.
  # y:   outcome, censored below at 0 (y = max(y*, 0)).
  # X:   covariates WITHOUT an intercept column (one is prepended here).
  #
  # Returns the NEGATIVE log-likelihood, since optim() minimizes.
  X <- cbind(1, X)
  sigma <- exp(par[1]) # Use exp() to keep value positive
  beta <- par[-1]

  z <- (y - X %*% beta)/sigma

  # Use log.p = TRUE / log = TRUE rather than log(pnorm(z)) etc.:
  # log(pnorm(z)) underflows to -Inf for moderately negative z, and the
  # original indicator products then evaluated 0 * (-Inf) = NaN. ifelse()
  # selects the correct branch per observation instead of mixing them.
  log_lik <- sum(ifelse(y == 0,
                        pnorm(z, log.p = TRUE),
                        dnorm(z, log = TRUE) - log(sigma)))

  # Return negative because we are minimizing
  -log_lik
}
    
# Starting values: log(sigma) = 0 plus the (attenuated) OLS coefficients
par1 <- c(0, lm1$coefficients)
# Nelder-Mead minimization of the negative Tobit log-likelihood
a1 <- optim(par = par1, fn = f_tobit, y = y, X = x)

# sigma, back-transformed from the log scale used inside f_tobit
exp(a1$par[1])
##           
## 0.9980347
# beta: close to the true values (a, b) = (2, -3)
a1$par[-1]
## (Intercept)           x 
##    2.010674   -3.012778
# Check against AER
AER::tobit(y ~ x)
## 
## Call:
## AER::tobit(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##       2.011       -3.013  
## 
## Scale: 0.9979

6.2.3 Simulation of a Selection Model

library(mvtnorm)
set.seed(123456789)
N <- 100
a <- 6   # outcome intercept
b <- -3  # outcome slope on x
c <- 4   # selection intercept
d <- -5  # selection slope on z

# Bivariate normal errors with unit variances and correlation 0.5
mu <- c(0, 0)
Sigma <- rbind(c(1, 0.5), c(0.5, 1))


# u is stored as a two-column matrix column inside the tibble; u[, 1] enters
# the selection index and u[, 2] the outcome. y is observed (nonzero) only
# when the selection index c + d*z + u[, 1] is positive.
df <- 
  tibble(x = runif(N),
         z = runif(N),
         u = rmvnorm(N, mean = mu, sigma = Sigma),
         y = ifelse(c + d*z + u[, 1] > 0, a + b * x + u[, 2], 0))

# OLS model
# (1) OLS on all data, (2) OLS on a z-subset, (3) probit for selection
lm1 <- lm(y ~ x, data = df)
lm2 <- update(lm1, subset = z < 0.6)
glm1 <- glm(y > 0 ~ z, family = binomial(link = "probit"), data = df)
stargazer(list(lm1, lm2, glm1), type = "text", keep.stat = c("n", "rsq"))
## 
## ==========================================
##                   Dependent variable:     
##              -----------------------------
##                       y            y > 0  
##                      OLS          probit  
##                 (1)       (2)       (3)   
## ------------------------------------------
## x             -1.314*  -3.358***          
##               (0.741)   (0.517)           
##                                           
## z                                -4.905***
##                                   (1.150) 
##                                           
## Constant     4.397***  6.039***  4.025*** 
##               (0.426)   (0.301)   (0.869) 
##                                           
## ------------------------------------------
## Observations    100       64        100   
## R2             0.031     0.405            
## ==========================================
## Note:          *p<0.1; **p<0.05; ***p<0.01

6.2.4 Heckman estimator in R

f_heckman <- function(par, y, X_in, Z_in = X_in) {
  # Negative log-likelihood of a Heckman (Tobit-2) selection model with the
  # outcome-error standard deviation fixed at 1 (dnorm below uses unit sd;
  # f_heckman_alt frees sigma).
  #
  # par:  c(logit-scale rho, beta, gamma) where beta has length ncol(X) + 1
  #       (intercept prepended) and gamma fills the remainder.
  # y:    outcome, 0 when the observation is not selected.
  # X_in: outcome covariates (no intercept column).
  # Z_in: selection covariates (no intercept column); defaults to X_in.

  X <- cbind(1, X_in)
  Z <- cbind(1, Z_in)
  # Sigmoid transform keeps rho in (0, 1).
  # NOTE(review): the true parameter space for a correlation is (-1, 1);
  # this parameterization cannot represent negative correlation.
  rho <- exp(par[1])/(1 + exp(par[1]))

  k <- dim(X)[2]
  beta <- par[2:(2 + k -1)]
  gamma <- par[(2 + k):length(par)]

  Xb <- X %*% beta
  Zg <- Z %*% gamma

  # Selection index adjusted for the observed outcome residual
  Zg_adj <- (Zg + rho * (y - Xb))/((1 - rho^2)^(.5))

  # log.p = TRUE / log = TRUE avoid log(pnorm(.)) underflow, and ifelse()
  # avoids the original 0 * (-Inf) = NaN when a probability underflows for
  # an observation in the other regime.
  log_lik <- ifelse(y == 0,
                    pnorm(-Zg, log.p = TRUE),
                    pnorm(Zg_adj, log.p = TRUE) + dnorm(y - Xb, log = TRUE))

  return(-sum(log_lik))
}

f_heckman_alt <- function(par, y, X_in, Z_in = X_in) {
  # Negative log-likelihood of the Heckman selection model with a free
  # outcome-error scale sigma.
  #
  # par:  c(logit-scale rho, log(sigma), beta, gamma).
  # y:    outcome, 0 for unselected observations.
  # X_in: outcome covariates (intercept prepended here).
  # Z_in: selection covariates (intercept prepended here); defaults to X_in.

  X_mat <- cbind(1, X_in)
  Z_mat <- cbind(1, Z_in)

  # Sigmoid maps par[1] into (0, 1); NOTE(review): a correlation can in
  # principle be negative, which this transform rules out.
  rho <- exp(par[1])/(1 + exp(par[1]))
  # exp() keeps sigma strictly positive.
  sigma <- exp(par[2])

  n_beta <- dim(X_mat)[2]
  beta <- par[seq(3, length.out = n_beta)]
  gamma <- par[seq(3 + n_beta, length(par))]

  sel_index <- Z_mat %*% gamma
  resid_out <- y - X_mat %*% beta

  # Selection index shifted by the scaled outcome residual.
  adj_index <- (sel_index + rho/sigma*resid_out)/sqrt(1 - rho^2)

  # Censored contribution: log P(not selected).
  ll_censored <- pnorm(-sel_index, log.p = TRUE)
  # Observed contribution: Gaussian outcome density times the conditional
  # selection probability, all on the log scale.
  ll_observed <- -1/2*log(2*pi) - log(sigma) +
    (pnorm(adj_index, log.p = TRUE) - 0.5*(resid_out/sigma)^2)

  -sum(ifelse(y == 0, ll_censored, ll_observed))
}

f_heckman_sigma_1 <- function(par, y, X_in, Z_in) {
  # Restricted Heckman likelihood with log(sigma) pinned at 0 (sigma = 1):
  # insert the fixed scale parameter into slot 2 and delegate, so
  # par = c(logit-scale rho, beta, gamma).
  f_heckman_alt(append(par, 0, after = 1), y, X_in, Z_in)
}

# Starting values: logit-scale rho = 0, OLS betas, probit gammas.
# NOTE: `X =` and `Z =` pass through optim()'s ... and reach f_heckman's
# X_in / Z_in formals via R's partial argument matching.
par1 <- c(0, lm1$coefficients, glm1$coefficients)
a1 <- optim(par = par1, fn = f_heckman, y = df$y, X = df$x, Z = df$z)
# rho, back-transformed through the sigmoid
exp(a1$par[1])/(1 + exp(a1$par[1]))
##           
## 0.5357194
# [-2] drops the outcome-equation intercept; the first printed value is the
# untransformed rho parameter
a1$par[-2]
##                       x (Intercept)           z 
##   0.1431213  -2.6569080   4.4067429  -5.4885393
# Same model via the sigma = 1 restriction of f_heckman_alt: identical fit
a3 <- optim(par = par1, fn = f_heckman_sigma_1,
            y = df$y, X = df$x, Z = df$z)
exp(a3$par[1])/(1 + exp(a3$par[1]))
##           
## 0.5357194
a3$par[-1]
## (Intercept)           x (Intercept)           z 
##    5.831804   -2.656908    4.406743   -5.488539
# Unrestricted model: add log(sigma) = 0 to the starting values
par2 <- c(0, 1, lm1$coefficients, glm1$coefficients)
a2 <- optim(par = par2, fn = f_heckman_alt,
            y = df$y, X = df$x, Z = df$z)
exp(a2$par[1])/(1 + exp(a2$par[1]))
##          
## 0.681301
a2$par[-1]
##             (Intercept)           x (Intercept)           z 
##   0.1876142   5.7687675  -2.6382857   4.1876889  -5.1778774
# Rerun with a tighter convergence tolerance and more iterations
par2 <- c(0, 1, lm1$coefficients, glm1$coefficients)
a2 <- optim(par = par2, fn = f_heckman_alt, y = df$y, X = df$x, Z = df$z,
            control = list(maxit = 1000, reltol = 1e-15))
exp(a2$par[1])/(1 + exp(a2$par[1]))
##           
## 0.6788183
a2$par[-1]
##             (Intercept)           x (Intercept)           z 
##   0.1874499   5.7678676  -2.6330984   4.1970042  -5.1898630
# Check against sampleSelection::selection (ML, Nelder-Mead to match optim)
heckit <- selection((y > 0) ~ z, y ~ x, data = df, method = "ml",
                    maxMethod = "NM")
summary(heckit)
## --------------------------------------------
## Tobit 2 model (sample selection model)
## Maximum Likelihood estimation
## Nelder-Mead maximization, 501 iterations
## Return code 1: iteration limit exceeded 
## Log-Likelihood: -154.0247 
## 100 observations (18 censored and 82 observed)
## 6 free parameters (df = 94)
## Probit selection equation:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.1846     0.9044   4.627 1.18e-05 ***
## z            -5.1593     1.2000  -4.299 4.18e-05 ***
## Outcome equation:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.7784     0.2790  20.713  < 2e-16 ***
## x            -2.6434     0.4636  -5.702 1.36e-07 ***
##    Error terms:
##       Estimate Std. Error t value Pr(>|t|)    
## sigma  1.20607    0.09848  12.246  < 2e-16 ***
## rho    0.68117    0.19222   3.544 0.000617 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------
heckit$estimate
## (Intercept)           z (Intercept)           x       sigma         rho 
##   4.1846081  -5.1593433   5.7784220  -2.6433702   1.2060677   0.6811696

6.3 Analysing the Gender Wage Gap

6.3.1 NLSY97 data

# NLSY97 data from the lmwr package. Drop rows with any missing values;
# note this overwrites the package data object in the workspace.
nlsy97 <- na.omit(nlsy97)

# Density of log full-time wages by gender, positive wages only
nlsy97 %>%
  filter(lftwage > 0) %>%
  ggplot(aes(x = lftwage, color = female)) +
  geom_density()

# Pooled wage regression with a female dummy, then separate fits by gender.
# The subset models keep `female` in the formula; it is constant within each
# subset, so lm() drops it (NA coefficient) there.
lm3 <- lm(lftwage ~ female + age + age2 + black + college + south + msa,
          data = nlsy97)
lm1 <- update(lm3, subset = !female)  # men only
lm2 <- update(lm3, subset = female)   # women only
Dependent variable: lftwage
(1) men only   (2) women only   (3) pooled sample
female -0.660***
(0.045)
age 0.990 0.430 0.292
(1.148) (0.807) (0.672)
age2 -0.017 -0.008 -0.004
(0.023) (0.016) (0.013)
black -0.660*** 0.020 -0.255***
(0.083) (0.057) (0.048)
college 0.547*** 0.620*** 0.596***
(0.133) (0.082) (0.072)
south 0.053 -0.025 -0.012
(0.080) (0.055) (0.047)
msa 0.030 0.038 0.048
(0.143) (0.120) (0.093)
Constant -12.470 -5.341 -3.297
(14.382) (10.084) (8.412)
Observations 1,076 1,555 2,631
R2 0.097 0.042 0.115
Note: *p<0.1; **p<0.05; ***p<0.01