
aipwML -- Regression adjustment, IPW, and AIPW estimation

The aipwML package computes causal effects using propensity score and outcome models estimated with linear / logistic regression, regularised regression (fit with glmnet), random forests (fit with ranger), and gradient boosted trees (fit with xgboost). It is written to be as modular as possible, so users can specify different choices for the outcome and propensity score models (the mhatter and ehatter components, respectively).

Installation

library(remotes)
remotes::install_github("apoorvalal/aipwML")

This writeup demonstrates the estimation functions on the Lalonde observational dataset, in which the experimental controls are replaced with control units from the PSID; standard estimators are badly biased for the experimental effect of ≈ $1700.

Data Prep

data(lalonde.psid); df = lalonde.psid
# outcome, treatment, and covariate column names
y = 're78'; w = 'treat'
x = setdiff(colnames(df), c(y, w))

# outcome model formula
fo = re78 ~ (age + education + black + hispanic + married + nodegree +
    re74 + re75 + u74 + u75)
# pscore formula
fp = treat ~ (age + education + black + hispanic + married + nodegree +
    re74 + re75 + u74 + u75)

We have data $\{ y_i, w_i, x_i \}_{i=1}^{N} \in \mathbb{R} \times \{0, 1\} \times \mathbb{R}^k$. Under selection-on-observables assumptions, we can compute the ATE by imputing the missing potential outcome.

Regression Adjustment

$$ \hat{\tau}_{\text{reg}}^{\text{ATE}} = \frac{1}{N} \sum_{i=1}^N ( \hat{\mu}_1 (x_i) - \hat{\mu}_0 (x_i) ) $$

regadjusts = c(
  ate_reg('ols',      w = w, y = y, df = df, fml = fo),
  ate_reg('lasso',    w = w, y = y, df = df, fml = fo),
  ate_reg('ridge',    w = w, y = y, df = df, fml = fo),
  ate_reg('rforest',  w = w, y = y, df = df, fml = fo),
  ate_reg('xgboost',  w = w, y = y, df = df, fml = fo)
)
regadjusts |> round(3)
## [1]  -8746 -13824 -12260  -9677 -11623

Pretty bad.
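
As a reference for what ate_reg('ols', ...) is estimating, here is a hand-rolled version of the regression-adjustment formula using lm. This is a sketch, not the package's internals; whether ate_reg fits separate arm-specific regressions or a single interacted model is an implementation detail.

# fit separate outcome regressions by treatment arm, impute both potential
# outcomes for every unit, and average the difference (manual sketch)
m1_fit = lm(fo, data = df[df$treat == 1, ])
m0_fit = lm(fo, data = df[df$treat == 0, ])
mean(predict(m1_fit, newdata = df) - predict(m0_fit, newdata = df))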

Inverse Propensity Weighting (IPW)

$$ \hat{\tau}_{\text{ipw}}^{\text{ATE}} = \frac{1}{N} \sum_{i=1}^N \frac{y_i (w_i - \hat{e}(x_i)) }{\hat{e}(x_i) (1 - \hat{e}(x_i)) } $$

ipws = c(
  ate_ipw('logit',   w = w, y = y, df = df, fml = fp),
  ate_ipw('lasso',   w = w, y = y, df = df, fml = fp),
  ate_ipw('ridge',   w = w, y = y, df = df, fml = fp),
  ate_ipw('rforest', w = w, y = y, df = df, fml = fp),
  ate_ipw('xgboost', w = w, y = y, df = df, fml = fp)
)

ipws |> round(3)
## [1] -10454 -15260 -17703 -19568 -19442

Still pretty bad. Now trim extreme pscores.

psr = c(0.05, 0.95)
ipws2 = c(
  ate_ipw('logit',   w = w, y = y, df = df, fml = fp, psrange = psr),
  ate_ipw('lasso',   w = w, y = y, df = df, fml = fp, psrange = psr),
  ate_ipw('ridge',   w = w, y = y, df = df, fml = fp, psrange = psr),
  ate_ipw('rforest', w = w, y = y, df = df, fml = fp, psrange = psr),
  ate_ipw('xgboost', w = w, y = y, df = df, fml = fp, psrange = psr)
)

ipws2 |> round(3)
## [1] -1356.4 -1144.8 -1623.8   361.6  2971.2

Better.
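
To see what the trimmed IPW estimator is doing, here is a hand-rolled version with a logistic pscore, mirroring psrange = c(0.05, 0.95). Again a sketch, not the package's internals; ate_ipw's exact trimming and normalisation conventions may differ slightly.

# fit a logit pscore, drop units with extreme fitted pscores, and apply the
# IPW formula above (manual sketch)
ps_fit = glm(fp, data = df, family = binomial())
eh     = predict(ps_fit, type = 'response')
keep   = eh > 0.05 & eh < 0.95
with(df[keep, ], mean(re78 * (treat - eh[keep]) / (eh[keep] * (1 - eh[keep]))))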

Augmented IPW

$$ \hat{\tau}_{\mathrm{AIPW}}^{\text{ATE}} = \frac{1}{N} \sum_{i=1}^{N} \left[\left( \hat{m}_{1}\left(x_{i}\right)+\frac{w_{i}}{\hat{e}\left(x_{i}\right)} \left(y_{i}-\hat{m}_{1}\left(x_{i}\right)\right)\right) - \left(\hat{m}_{0}\left(x_{i}\right)+\frac{1-w_{i}}{1-\hat{e}\left(x_{i}\right)} \left(y_{i}-\hat{m}_{0}\left(x_{i}\right)\right)\right)\right] $$

We need to choose how to estimate e and m, so we perform an exhaustive search: for each choice of outcome model, we try every pscore fitter. (A programmatic version of this grid is sketched after the blocks below.)

OLS outcome model

ols_mean = c(
  ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'logit',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'lasso',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'ridge',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'rforest',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'ols', pscorefn = 'xgboost',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)

Ridge outcome model

ridge_mean = c(
  ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'logit',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'lasso',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'ridge',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'rforest',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'ridge', pscorefn = 'xgboost',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)

LASSO outcome model

lasso_mean = c(
  ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'logit',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'lasso',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'ridge',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'rforest',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'lasso', pscorefn = 'xgboost',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)

Random Forest outcome model

rforest_mean = c(
  ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'logit',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'lasso',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'ridge',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'rforest',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'rforest', pscorefn = 'xgboost',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)

GBM outcome model

xgboost_mean = c(
  ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'logit',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'lasso',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'ridge',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'rforest',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  ate_aipw(fit_me(meanfn = 'xgboost', pscorefn = 'xgboost',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = df), psrange = psr)
)
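
Rather than spelling out each cell by hand, the same 5 × 5 grid can be generated programmatically. This is a sketch assuming fit_me and ate_aipw behave exactly as in the calls above; the manual blocks are kept so the table code below runs unchanged.

# every (outcome fitter, pscore fitter) combination in one pass (sketch)
meanfns = c('ols', 'lasso', 'ridge', 'rforest', 'xgboost')
psfns   = c('logit', 'lasso', 'ridge', 'rforest', 'xgboost')
grid = expand.grid(meanfn = meanfns, pscorefn = psfns, stringsAsFactors = FALSE)
grid$est = mapply(
  function(m, p) ate_aipw(fit_me(meanfn = m, pscorefn = p, mean_fml = fo,
    psc_fml = fp, y = y, w = w, df = df), psrange = psr),
  grid$meanfn, grid$pscorefn)

The 25 estimates can then be reshaped into the 5 × 5 table built by hand below.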

AIPW table

# stack estimates into an outcome-model x pscore-model matrix
library(kableExtra)
aipw_estimates = rbind(ols_mean, lasso_mean, ridge_mean, rforest_mean, xgboost_mean)
colnames(aipw_estimates) = c('PS: logit', 'PS: lasso', 'PS: ridge',
  'PS: rforest', 'PS: xgboost')
rownames(aipw_estimates) = c('Outcome: ols', 'Outcome: lasso', 'Outcome: ridge',
  'Outcome: rforest', 'Outcome: xgboost')
aipw_estimates |> kbl() |> kable_styling()
                   PS: logit  PS: lasso  PS: ridge  PS: rforest  PS: xgboost
Outcome: ols           56.49      1.199     -519.4        439.2         3227
Outcome: lasso       -236.96    -74.295     -909.8        558.7         3213
Outcome: ridge       -245.79   -478.324     -620.9        189.8         3247
Outcome: rforest     -197.46    110.549     -355.1        764.2         3202
Outcome: xgboost     -320.45      6.479     -879.7        307.1         3249

A relatively stable row or column in the above table suggests that we got one of the two nuisance functions ‘right’. In this case, it looks like the GBM pscore function yields stable estimates across all choices of outcome models.

Manual use for inference and other estimands

The fit_me function fits the outcome models m0 and m1 and the pscore e for each observation, and returns a dataset that can then be used for manual calculations.

library(data.table)
fit_mod = fit_me(meanfn = 'xgboost', pscorefn = 'xgboost',
    mean_fml = fo, psc_fml  = fp, y = y, w = w, df = df)
setDT(fit_mod)
fit_mod |> head()
##          y     w    m0      m1     eh
##      <num> <num> <num>   <num>  <num>
## 1:  9930.0     1  9017  9929.8 0.9657
## 2:  3595.9     1 11787  3593.9 1.0012
## 3: 24909.5     1  3748 24906.3 0.9887
## 4:  7506.1     1  8709  2685.8 1.0012
## 5:   289.8     1  2021   291.1 0.9965
## 6:  4056.5     1  6751  4060.6 0.9889

Trim extreme pscores before AIPW:

fit_mod |> ate_aipw(c(0.1, 0.9)) |> round(3)
## [1] 2893
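
Because fit_mod contains y, w, m0, m1, and eh, the AIPW point estimate and a plug-in standard error can also be computed by hand from the influence function. This is a sketch; how ate_aipw handles trimming internally may differ, so treat the numbers as illustrative.

# AIPW influence function psi_i computed from the fitted values, with manual
# trimming of extreme pscores (sketch)
trimmed = fit_mod[eh %between% c(0.1, 0.9)]
trimmed[, psi := (m1 - m0) + w / eh * (y - m1) - (1 - w) / (1 - eh) * (y - m0)]
trimmed[, .(est = mean(psi), se = sd(psi) / sqrt(.N))]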

Bootstrap

library(boot); library(MASS)
boot.fn <- function(data, ind) {
  # refit both nuisance models on the bootstrap sample and return the AIPW estimate
  d = data[ind, ]
  fit_me(meanfn = 'lasso', pscorefn = 'lasso',
    mean_fml = fo, psc_fml = fp, y = y, w = w, df = d) |>
    ate_aipw(c(0.1, 0.9))
}
out = boot(df, boot.fn, R = 100)
out |> print()
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = df, statistic = boot.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*      342    -101        1023
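
A percentile confidence interval can then be read off from the bootstrap object with boot.ci:

boot.ci(out, type = 'perc')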

ATT

The ATT subsets to treated units and averages the difference between the realised Y and the imputed Y(0), which is easy to compute from the fitted values.
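
In the notation above,

$$ \hat{\tau}^{\text{ATT}} = \frac{1}{N_1} \sum_{i : w_i = 1} \left( y_i - \hat{m}_0(x_i) \right) $$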

fit_mod[w == 1, mean(y - m0)] |> round(3)
## [1] 1796


