26 Selection models

The code in this chapter uses the packages listed in the code below. For instructions on how to set up your computer to use the code found in this book, see Section 1.2.1.

library(dplyr, warn.conflicts = FALSE)
library(dbplyr, warn.conflicts = FALSE)
library(DBI)
library(sampleSelection)
library(modelsummary)

Quarto templates for the exercises below are available at https://github.com/iangow/far_templates.

26.1 Replication of Jackson et al. (2009)

pg <- dbConnect(RPostgres::Postgres(),
                bigint = "integer",
                check_interrupts = TRUE)

funda <- tbl(pg, Id(schema = "comp", table = "funda"))
company <- tbl(pg, Id(schema = "comp", table = "company"))
aco_pnfnda <- tbl(pg, Id(schema = "comp", table = "aco_pnfnda"))
funda_fncd <- tbl(pg, Id(schema = "comp", table ="funda_fncd"))
dsf <- tbl(pg, Id(schema = "crsp", table = "dsf"))
msf <- tbl(pg, Id(schema = "crsp", table = "msf"))
ccmxpf_lnkhist <- tbl(pg, Id(schema = "crsp", table = "ccmxpf_lnkhist"))

crsp_first_date <- 
  dsf |>
  group_by(permno) |>
  summarize(min_date = min(date, na.rm = TRUE)) |>
  ungroup() 

funda_mod <-
  funda |>
  filter(indfmt == "INDL", datafmt == "STD",
         consol == "C", popsrc == "D") |>
  mutate(sich = as.character(sich)) 

ccm_link <-
  ccmxpf_lnkhist |>
  filter(linktype %in% c("LC", "LU", "LS"),
         linkprim %in% c("C", "P")) |>
  mutate(linkenddt = coalesce(linkenddt, max(linkenddt, na.rm = TRUE))) |>
  rename(permno = lpermno) |>
  select(gvkey, permno, linkdt, linkenddt) 
  
firm_age <-
  funda_mod |>
  inner_join(ccm_link, by = join_by(gvkey, 
                                    datadate >= linkdt,
                                    datadate <= linkenddt)) |>
  inner_join(crsp_first_date, by = "permno") |>
  mutate(age = year(age(datadate, min_date))) |>
  select(gvkey, datadate, age) |>
  collect()

rets <-
  funda_mod |>
  select(gvkey, datadate) |>
  mutate(start_date = datadate - months(12)) |>
  inner_join(ccm_link, 
             by = join_by(gvkey, 
                          datadate >= linkdt,
                          datadate <= linkenddt)) |>
  inner_join(msf, 
             by = join_by(permno, 
                          start_date < date,
                          datadate >= date)) |>
  group_by(gvkey, datadate) |>
  summarize(ret = sum(ln(1 + ret), na.rm = TRUE) - 1,
            n_obs = n(), 
            .groups = "drop") |>
  collect()

sics <- 
  company |>
  select(gvkey, sic)

depr <-
  funda_fncd |>
  mutate(dp_method = 
           case_when(dpact_fn %in% c("TB", "TU") ~ "Mix",
                     dpact_fn %in% c("TC", "TV") ~ "Accel",
                     dpact_fn %in% c("TS", "TX") ~ "SL")) |> 
  filter(indfmt == "INDL", datafmt == "STD",
         consol == "C", popsrc == "D") |>
  select(gvkey, datadate, dp_method) |>
  collect()

pensions <-
  aco_pnfnda |>
  filter(indfmt == "INDL", datafmt == "STD",
         consol == "C", popsrc == "D") |>
  mutate(d_pen = coalesce(pbpro > 0 | pbpru > 0 | !is.na(pbarr), FALSE)) |>
  select(gvkey, datadate, d_pen) |>
  collect()

roas <-
  funda_mod |>
  mutate(roa = if_else(at > 0, ib/at, NA)) |>
  filter(!is.na(roa)) |>
  inner_join(sics, by = "gvkey") |> 
  mutate(sic2 = substr(sic, 1, 2)) |>
  group_by(sic2) |>
  mutate(roa_decile = ntile(roa, 10)) |>
  ungroup() |>
  mutate(d_roa = roa_decile %in% 2:9) |>
  select(gvkey, datadate, d_roa) |>
  collect()

raw_data <-
  funda_mod |>
  select(gvkey, datadate, fyear, xrd, at, ppegt, prcc_f, csho, sale, capx,
         cogs, np, xad, dltt, dpact, ceq, sich, che, oancf) |>
  inner_join(sics, by = "gvkey") |>
  collect() |>
  inner_join(depr, by = c("gvkey", "datadate")) |>
  inner_join(pensions, by = c("gvkey", "datadate")) |>
  inner_join(roas, by = c("gvkey", "datadate")) |>
  inner_join(firm_age, by = c("gvkey", "datadate")) |>
  inner_join(rets, by = c("gvkey", "datadate")) |>
  mutate(sic = coalesce(sich, sic),
         sic3 = as.integer(substr(sic, 1, 3)))

reg_data <-
  raw_data |>
  mutate(mv = prcc_f * csho,
         mb = if_else(ceq > 0, mv/ceq, NA),
         sale = if_else(sale > 0, sale, NA)) |>
  group_by(gvkey) |>
  arrange(datadate) |>
  mutate(adj_at = at + coalesce(dpact, 0),
         lag_at = lag(at),
         lag_adj_at = lag(adj_at),
         lag_capx = lag(capx),
         lag_mb = lag(mb),
         lag_cash = lag(che),
         lag_2_cash = lag(che, 2),
         lag_ret = lag(ret),
         lag_sale = lag(sale)) |>
  ungroup() |>
  mutate(adj_ta = (adj_at + lag_adj_at)/2,
         dur = case_when(between(sic3, 150, 179) ~ TRUE,
                         between(sic3, 250, 259) ~ TRUE,
                         between(sic3, 324, 399) ~ TRUE,
                         sic3 %in% c(245, 283, 301) ~ TRUE,
                         !is.na(sic3) ~ FALSE),
         og = sic3 %in% c(131, 291),
         rd = if_else(adj_ta > 0, coalesce(xrd, 0)/adj_ta, NA),
         labor = if_else(adj_at > 0, 1 - ppegt/adj_at, NA),
         d_mfg = between(sic3, 200, 399),
         d_nmfg = !d_mfg, 
         cgs = if_else(adj_ta > 0, cogs/adj_ta, NA),
         npay = if_else(adj_at > 0, coalesce(np, 0)/adj_at, NA),
         adv = if_else(adj_ta > 0, coalesce(xad, 0)/adj_ta, NA),
         levmv = if_else(mv > 0, coalesce(dltt, 0)/mv, NA),
         levta = if_else(adj_ta > 0, (coalesce(np, 0) + 
                                        coalesce(dltt, 0))/adj_ta, NA),
         log_sale = case_when(sale > 0 & !is.na(sale) ~ log(sale),
                          .default = NA),
         log_lag_sale = if_else(lag_sale > 0 & !is.na(lag_sale), 
                            log(lag_sale), NA),
         choice = dp_method %in% c("Accel", "Mix"),
         capx = if_else(adj_ta > 0, capx/adj_ta, NA),
         lag_capx = if_else(lag_adj_at > 0, lag_capx/lag_adj_at, NA),
         lag_cash = if_else(lag_adj_at > 0, lag_cash/lag_adj_at, NA),
         lag_2_cash = if_else(lag_adj_at > 0, lag_2_cash/lag_adj_at, NA),
         cash = if_else(adj_at > 0, che/adj_at, NA),
         d_sale = if_else(adj_at > 0, (sale - lag_sale)/adj_at, NA),
         d_cash = cash - lag_cash,
         lag_d_cash = lag_cash - lag_2_cash,
         cfo = if_else(adj_at > 0, oancf/adj_at, NA),
         age = if_else(age > 0, log(age), NA),
         size = if_else(adj_at > 0, log(adj_at), NA))

fm_1 <- glm(choice ~ dur + rd + labor + d_pen + cgs * d_mfg + npay + adv + levmv + d_roa + I(og * sale), 
            family = binomial(link = "probit"), 
            data = reg_data)

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

used_data <- reg_data[-na.action(fm_1), ]

mills_data <-
  used_data |>
  mutate(mills = invMillsRatio(fm_1, all = TRUE)$IMR1)

fms <- list()
fms[[1]] <- lm(capx ~ choice + lag_capx + lag_mb + levta + d_cash + lag_d_cash +
             age + size + lag_ret + cfo + d_sale + mills, 
           data = mills_data)

fms[[2]] <- lm(capx ~ choice + lag_capx + lag_mb + levta + d_cash + lag_d_cash +
                 age + size + lag_ret + cfo + d_sale, 
               data = mills_data)

reg_data_2 <-
  reg_data |>
  filter(if_all(c(choice, dur, rd, labor, d_pen, cgs, d_mfg, 
                  npay, adv, levmv, d_roa, og, sale, capx, lag_capx,
                  lag_mb, levta, d_cash, lag_d_cash, age, size, 
                  lag_ret, cfo, d_sale), 
                \(x) !is.na(x))) |>
  as.data.frame()

fms[[3]] <- selection(selection = choice ~ dur + rd + labor + d_pen + cgs * d_mfg + npay + adv + levmv + d_roa + I(og * sale),
                   outcome = capx ~ choice + lag_capx + lag_mb + levta + d_cash + lag_d_cash +
                     age + size + lag_ret + cfo + d_sale,
                   data = reg_data_2)

modelsummary(fm_1,
             estimate = "{estimate}{stars}",
             gof_map = c("nobs", "r.squared", "adj.r.squared"),
             stars = c('*' = .1, '**' = 0.05, '***' = .01))

Table 26.1: First stage

	(1)
(Intercept)	−0.062***
	(0.020)
durTRUE	0.073***
	(0.014)
rd	−1.260***
	(0.137)
labor	−1.063***
	(0.022)
d_penTRUE	−0.232***
	(0.011)
cgs	−0.120***
	(0.011)
d_mfgTRUE	0.141***
	(0.018)
npay	0.655***
	(0.070)
adv	−0.653***
	(0.176)
levmv	−0.065***
	(0.004)
d_roaTRUE	−0.047***
	(0.014)
I(og * sale)	0.000***
	(0.000)
cgs × d_mfgTRUE	0.133***
	(0.018)
Num.Obs.	90608

modelsummary(fms,
             estimate = "{estimate}{stars}",
             gof_map = c("nobs", "r.squared", "adj.r.squared"),
             stars = c('*' = .1, '**' = 0.05, '***' = .01))

Warning: There are duplicate term names in the table.
  The `shape` argument of the `modelsummary` function can be used to print
  related terms together. The `group_map` argument can be used to reorder,
  subset, and rename group identifiers. See `?modelsummary` for details.
  You can find the group identifier to use in the `shape` argument by
  calling `get_estimates()` on one of your models. Candidates include:
  s.value, group, component

Table 26.2: Second stage

	(1)	(2)	(3)
(Intercept)	0.043***	0.025***	−0.102***
	0.043***	0.025***	0.024***
	(0.001)	(0.001)	(0.001)
	(0.001)	(0.001)	(0.028)
choiceTRUE	0.004***	0.005***	0.017***
	(0.000)	(0.000)	(0.001)
lag_capx	0.707***	0.733***	0.721***
	(0.003)	(0.003)	(0.003)
lag_mb	0.000	0.000	0.000
	(0.000)	(0.000)	(0.000)
levta	0.010***	0.007***	0.008***
	(0.001)	(0.001)	(0.001)
d_cash	−0.051***	−0.053***	−0.052***
	(0.002)	(0.002)	(0.002)
lag_d_cash	0.015***	0.014***	0.015***
	(0.002)	(0.002)	(0.002)
age	−0.002***	−0.001***	−0.001***
	(0.000)	(0.000)	(0.000)
size	0.000***	0.000***	0.000***
	(0.000)	(0.000)	(0.000)
lag_ret	0.011***	0.011***	0.011***
	(0.000)	(0.000)	(0.000)
cfo	0.016***	0.018***	0.017***
	(0.001)	(0.001)	(0.001)
d_sale	0.004***	0.004***	0.004***
	(0.000)	(0.000)	(0.000)
mills	−0.012***
	(0.000)
durTRUE			0.140***
			(0.019)
rd			−1.031***
			(0.178)
labor			−1.292***
			(0.029)
d_penTRUE			−0.074***
			(0.018)
cgs			−0.248***
			(0.019)
d_mfgTRUE			0.010
			(0.025)
npay			0.735***
			(0.108)
adv			−0.937***
			(0.245)
levmv			−0.064***
			(0.006)
d_roaTRUE			−0.044**
			(0.019)
I(og * sale)			0.000***
			(0.000)
cgs × d_mfgTRUE			0.182***
			(0.031)
sigma			0.031***
			(0.000)
rho			−0.223***
			(0.014)
Num.Obs.	55406	55407	55407
R2	0.531	0.525
R2 Adj.	0.531	0.525

26.2 Discussion questions

Based on the evidence in Lennox et al. (2012), why do authors use selection models?
How do you interpret the results in Tables 4 and 5 of Lennox et al. (2012)?
What’s going on in Table 6 of Lennox et al. (2012)?
Evaluate the “suggestions” in section VI of Lennox et al. (2012).
Lennox et al. (2012) say:

For example, Bushee et al. (2003) stands outs in terms of justifying the variables in the first and second stage models, reporting sensitivity tests to alternative model specifications, and reporting diagnostic tests for multicollinearity. Moreover, Bushee et al. (2003) clearly identify their exclusion restrictions and they state that they do not have good reason to expect that those Z variables would directly affect the dependent variable in the second stage model.

Discuss the details of what Bushee et al. (2003) do on each of the points mentioned by Lennox et al. (2012).

Which table in Bushee et al. (2003) includes the main results of their second-stage model? What do they find there?
How plausible do you find the idea studied in Jackson et al. (2009)? Suppose you could run a field experiment (e.g., you were appointed head of the relevant regulator and given authority to run experiments) to test these ideas. What choices might you make to get the best experiment?