# Simulation of Censored Data ----
# Simulate a latent outcome y* = a + b*x + u and censor it from below at
# zero, then compare estimators on the censored data.
set.seed(123456789)
N <- 500
a <- 2
b <- -3
x <- runif(N)
u <- rnorm(N)
y_star <- a + b * x + u
# y <- pmax(y_star, 0)
# Observe y* only when it is positive; censored observations are set to 0.
y <- ifelse(y_star > 0, y_star, 0)
# OLS on the censored outcome: the slope estimate is attenuated
# (-2.03 vs the true -3).
lm1 <- lm(y ~ x)
lm1$coefficients[2]
## x
## -2.026422
# Restricting to x < 0.6, where censoring is rare, recovers the slope.
lm(y[x < 0.6] ~ x[x < 0.6])$coefficients[2]
## x[x < 0.6]
## -2.993644
# NOTE(review): the two "[1] 291" printouts below have no matching code in
# this chunk -- presumably something like sum(y > 0) was evaluated; verify.
## [1] 291
## [1] 291
# Probit on the censoring indicator recovers a and b (sd(u) = 1 here).
glm(y > 0 ~ x, family = binomial(link = "probit"))$coefficients
## (Intercept) x
## 2.015881 -3.013131
# 6.2.5 Tobit estimator in R ----
f_tobit <- function(par, y, X) {
  # Negative log-likelihood of the type-I Tobit model, censored at 0.
  #
  # par: c(log(sigma), intercept, slopes...) -- sigma enters through exp()
  #      so the optimizer can search an unconstrained scale.
  # y:   observed outcome; equal to 0 where the latent outcome is censored.
  # X:   covariates (an intercept column is prepended here).
  #
  # Returns the negative log-likelihood (optim() minimizes by default).
  X <- cbind(1, X)
  sigma <- exp(par[1]) # Use exp() to keep value positive
  beta <- par[-1]
  z <- (y - X %*% beta)/sigma
  # Censored obs contribute P(y* <= 0) = pnorm(z), with z = -Xb/sigma when
  # y = 0; uncensored obs contribute the scaled normal density.
  # ifelse() with log.p = TRUE / log = TRUE avoids the 0 * -Inf = NaN that
  # the (y == 0) * log(pnorm(z)) form produces when pnorm()/dnorm()
  # underflow to 0 for extreme z.
  log_lik <- sum(ifelse(y == 0,
                        pnorm(z, log.p = TRUE),
                        dnorm(z, log = TRUE) - log(sigma)))
  # Return negative because we are minimizing
  -log_lik
}
# Starting values: log(sigma) = 0 plus the (biased) OLS coefficients.
par1 <- c(0, lm1$coefficients)
a1 <- optim(par = par1, fn = f_tobit, y = y, X = x)
# sigma
exp(a1$par[1])
##
## 0.9980347
# NOTE(review): the coefficient printout below has no matching code in this
# chunk -- presumably a1$par[-1] was evaluated; verify.
## (Intercept) x
## 2.010674 -3.012778
# Check against AER
AER::tobit(y ~ x)
##
## Call:
## AER::tobit(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 2.011 -3.013
##
## Scale: 0.9979
# 6.4.2 Simulation of a selection model ----
# Simulate a sample-selection model: y is observed (nonzero) only when the
# latent selection index c + d*z + u1 is positive. u1 and u2 are correlated
# (rho = 0.5), so OLS on the selected sample is biased.
library(mvtnorm)
set.seed(123456789)
N <- 100
a <- 6
b <- -3
# Assigning to c shadows base::c only for non-function lookups; the calls
# to c(0, 0) below still resolve to the base function.
c <- 4
d <- -5
mu <- c(0, 0)
Sigma <- rbind(c(1, 0.5), c(0.5, 1))
# NOTE(review): tibble() and stargazer() are used without
# library(tibble)/library(stargazer) in this chunk -- presumably loaded
# earlier in the file; verify.
df <-
tibble(x = runif(N),
z = runif(N),
# u is a two-column matrix column: u[, 1] feeds selection, u[, 2] the outcome
u = rmvnorm(N, mean = mu, sigma = Sigma),
y = ifelse(c + d*z + u[, 1] > 0, a + b * x + u[, 2], 0))
# OLS model
lm1 <- lm(y ~ x, data = df)
# Restricting to z < 0.6 (where selection is nearly certain) reduces bias.
lm2 <- update(lm1, subset = z < 0.6)
# Probit for the selection indicator.
glm1 <- glm(y > 0 ~ z, family = binomial(link = "probit"), data = df)
stargazer(list(lm1, lm2, glm1), type = "text", keep.stat = c("n", "rsq"))
##
## ==========================================
## Dependent variable:
## -----------------------------
## y y > 0
## OLS probit
## (1) (2) (3)
## ------------------------------------------
## x -1.314* -3.358***
## (0.741) (0.517)
##
## z -4.905***
## (1.150)
##
## Constant 4.397*** 6.039*** 4.025***
## (0.426) (0.301) (0.869)
##
## ------------------------------------------
## Observations 100 64 100
## R2 0.031 0.405
## ==========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Heckman estimator in R ----
f_heckman <- function(par, y, X_in, Z_in = X_in) {
  # Negative log-likelihood of the Heckman selection model with the
  # outcome-error scale implicitly fixed at sigma = 1.
  #
  # par:  c(logit(rho), beta..., gamma...) where beta are the outcome
  #       coefficients and gamma the selection coefficients.
  # y:    observed outcome; 0 where the observation is not selected.
  # X_in: outcome-equation covariates; Z_in: selection covariates
  #       (intercept columns are prepended here).
  #
  # Returns the negative log-likelihood for minimization by optim().
  X <- cbind(1, X_in)
  Z <- cbind(1, Z_in)
  # Sigmoid transform: this constrains rho to (0, 1). The model allows
  # rho in (-1, 1), so negative selection correlation cannot be
  # represented with this parameterization.
  rho <- exp(par[1])/(1 + exp(par[1]))
  k <- dim(X)[2]
  beta <- par[2:(2 + k - 1)]
  gamma <- par[(2 + k):length(par)]
  Xb <- X %*% beta
  Zg <- Z %*% gamma
  Zg_adj <- (Zg + rho * (y - Xb))/((1 - rho^2)^(.5))
  # ifelse() with log.p = TRUE / log = TRUE avoids the 0 * -Inf = NaN that
  # the multiply-by-indicator form produces when pnorm()/dnorm() underflow.
  log_lik <- ifelse(y == 0,
                    pnorm(-Zg, log.p = TRUE),
                    pnorm(Zg_adj, log.p = TRUE) + dnorm(y - Xb, log = TRUE))
  return(-sum(log_lik))
}
f_heckman_alt <- function(par, y, X_in, Z_in = X_in) {
  # Negative log-likelihood of the Heckman selection model with a free
  # outcome-error scale sigma.
  #
  # par:  c(logit(rho), log(sigma), beta..., gamma...) -- rho enters via a
  #       sigmoid (so it lies in (0, 1), although the model only requires
  #       rho in (-1, 1)) and sigma via exp() to stay positive.
  # y:    observed outcome; 0 where the observation is not selected.
  # X_in: outcome covariates; Z_in: selection covariates (intercepts added).
  #
  # Returns the negative log-likelihood for minimization by optim().
  design_out <- cbind(1, X_in)
  design_sel <- cbind(1, Z_in)
  rho <- plogis(par[1])    # sigmoid: exp(p)/(1 + exp(p))
  sigma <- exp(par[2])
  n_beta <- ncol(design_out)
  beta <- par[seq(3, length.out = n_beta)]
  gamma <- par[seq(3 + n_beta, length(par))]
  resid_out <- y - design_out %*% beta
  index_sel <- design_sel %*% gamma
  # Conditional selection index, adjusted for the observed outcome residual.
  adj <- (index_sel + (rho/sigma) * resid_out)/sqrt(1 - rho^2)
  # Censored: P(not selected); observed: normal density of the residual
  # times the conditional selection probability, all on the log scale.
  ll <- ifelse(y == 0,
               pnorm(-index_sel, log.p = TRUE),
               pnorm(adj, log.p = TRUE) - 0.5*log(2*pi) - log(sigma) -
                 0.5*(resid_out/sigma)^2)
  -sum(ll)
}
f_heckman_sigma_1 <- function(par, y, X_in, Z_in) {
  # Wrapper around f_heckman_alt() with log(sigma) pinned at 0 (sigma = 1),
  # making its results directly comparable to f_heckman(), which omits
  # sigma entirely. par = c(logit(rho), beta..., gamma...).
  par_alt <- c(par[1], 0, par[-1])
  f_heckman_alt(par_alt, y, X_in, Z_in)
}
# Starting values: logit(rho) = 0 plus the OLS and probit coefficients.
par1 <- c(0, lm1$coefficients, glm1$coefficients)
# NOTE(review): f_heckman's parameters are X_in/Z_in; the X = and Z =
# arguments match only via partial argument matching -- spelling them out
# as X_in =/Z_in = would be safer.
a1 <- optim(par = par1, fn = f_heckman, y = df$y, X = df$x, Z = df$z)
# Invert the sigmoid to recover the rho estimate.
exp(a1$par[1])/(1 + exp(a1$par[1]))
##
## 0.5357194
# NOTE(review): the coefficient printout below has no matching code in this
# chunk -- presumably a1$par[-1] was evaluated; verify.
## x (Intercept) z
## 0.1431213 -2.6569080 4.4067429 -5.4885393
# Same model via the sigma-pinned wrapper: identical estimates, as expected.
a3 <- optim(par = par1, fn = f_heckman_sigma_1,
y = df$y, X = df$x, Z = df$z)
exp(a3$par[1])/(1 + exp(a3$par[1]))
##
## 0.5357194
## (Intercept) x (Intercept) z
## 5.831804 -2.656908 4.406743 -5.488539
# Free-sigma version: add log(sigma) = log(1) = 0 as the second start value.
par2 <- c(0, 1, lm1$coefficients, glm1$coefficients)
a2 <- optim(par = par2, fn = f_heckman_alt,
y = df$y, X = df$x, Z = df$z)
exp(a2$par[1])/(1 + exp(a2$par[1]))
##
## 0.681301
## (Intercept) x (Intercept) z
## 0.1876142 5.7687675 -2.6382857 4.1876889 -5.1778774
# Refit with a tighter tolerance and more iterations for better convergence.
par2 <- c(0, 1, lm1$coefficients, glm1$coefficients)
a2 <- optim(par = par2, fn = f_heckman_alt, y = df$y, X = df$x, Z = df$z,
control = list(maxit = 1000, reltol = 1e-15))
exp(a2$par[1])/(1 + exp(a2$par[1]))
##
## 0.6788183
## (Intercept) x (Intercept) z
## 0.1874499 5.7678676 -2.6330984 4.1970042 -5.1898630
# Cross-check against the packaged estimator, using Nelder-Mead ML to
# match the optim() runs above.
# NOTE(review): selection() is from the sampleSelection package, which is
# not loaded in this chunk -- presumably library(sampleSelection) appears
# earlier in the file; verify.
heckit <- selection((y > 0) ~ z, y ~ x, data = df, method = "ml",
maxMethod = "NM")
summary(heckit)
## --------------------------------------------
## Tobit 2 model (sample selection model)
## Maximum Likelihood estimation
## Nelder-Mead maximization, 501 iterations
## Return code 1: iteration limit exceeded
## Log-Likelihood: -154.0247
## 100 observations (18 censored and 82 observed)
## 6 free parameters (df = 94)
## Probit selection equation:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.1846 0.9044 4.627 1.18e-05 ***
## z -5.1593 1.2000 -4.299 4.18e-05 ***
## Outcome equation:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.7784 0.2790 20.713 < 2e-16 ***
## x -2.6434 0.4636 -5.702 1.36e-07 ***
## Error terms:
## Estimate Std. Error t value Pr(>|t|)
## sigma 1.20607 0.09848 12.246 < 2e-16 ***
## rho 0.68117 0.19222 3.544 0.000617 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------
## (Intercept) z (Intercept) x sigma rho
## 4.1846081 -5.1593433 5.7784220 -2.6433702 1.2060677 0.6811696