Example: Estimating ATEs with DRLearner¶
Motivation¶
Estimating average treatment effects (ATEs) involves a subset of the tasks involved in estimating Conditional Average Treatment Effects (CATEs), so we can use methods that are designed for estimating CATEs to estimate ATEs.
In this example, we simulate some data with a confounded binary treatment and demonstrate the average_treatment_effect method of the DRLearner class, which estimates the ATE, and compare it to estimates from some other popular libraries (econML and doubleML). We then show how this approach generalizes to a setting with multiple discrete treatments, and finally to a setting with discrete-valued outcomes.
Example¶
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
np.random.seed(42)
Binary Treatment with confounding¶
def binary_dgp(n, k, pscore_fn, tau_fn, outcome_fn, k_cat=1):
"""DGP for a confounded treatment assignment dgp
Args:
n (int): sample size
k (int): number of continuous covariates
pscore_fn (lambda): propensity score function
tau_fn (lambda): treatment effect function. Can be scalar for constant effect.
outcome_fn (lambda): outcome DGP
"""
Sigma = np.random.uniform(-1, 1, (k, k))
Sigma = Sigma @ Sigma.T
Xnum = np.random.multivariate_normal(np.zeros(k), Sigma, n)
# generate categorical variables
Xcat = np.random.binomial(1, 0.5, (n, k_cat))
X = np.c_[Xnum, Xcat]
W = np.random.binomial(1, pscore_fn(X), n)
Y = outcome_fn(X, W, tau_fn)
df = pd.DataFrame(
np.c_[Y, W, X], columns=["Y", "W"] + [f"X{i}" for i in range(k + 1)]
)
return df
def outcome_fn(x, w, taufn):
return (
taufn(x) * w
+ x[:, 0]
+ 2 * x[:, 1] ** 2
+ 3 * x[:, 3] * x[:, 1]
+ x[:, 2]
+ x[:, 3]
+ np.random.normal(0, 1, n)
)
n, k = 10_000, 3
pscore_fn = lambda x: 1 / (1 + np.exp(-x[:, 0] - x[:, 1] - x[:, 2] ** 2 + x[:, 3]))
# simulate constant_effects
# tau_fn = lambda x: 1 + 2 * x[:, 0] + 3 * x[:, 1] + 4 * x[:, 2] + 5 * x[:, 3]
tau_fn = lambda x: 1
df = binary_dgp(n, k, pscore_fn, tau_fn=tau_fn, outcome_fn=outcome_fn)
outcome_column, treatment_column = "Y", "W"
feature_columns = [f"X{i}" for i in range(k + 1)]
Linear Regression¶
naive_lm = smf.ols(f"{outcome_column} ~ {treatment_column}", df) .fit(cov_type="HC1")
naive_est = naive_lm.params.iloc[1], naive_lm.bse.iloc[1]
naive_est
(2.083595103597772, 0.06526671583747425)
covaradjust_lm = smf.ols(f"{outcome_column} ~ {treatment_column}+{'+'.join(feature_columns)}",
df) .fit(cov_type="HC1")
linreg_est = covaradjust_lm.params.iloc[1], covaradjust_lm.bse.iloc[1]
linreg_est
(2.1433722387306764, 0.06345124983351617)
Linear model is misspecified, so both the naive and conditional estimates are biased.
metalearners: DRLearner¶
Point estimates and standard errors for treatment effects for the AIPW estimator can be computed by aggregating the pseudo-outcome computed by the DRLearner class.
from metalearners import DRLearner
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.dummy import DummyRegressor
metalearners_dr = DRLearner(
nuisance_model_factory=LGBMRegressor,
treatment_model_factory=DummyRegressor, # not actually used since we don't fit treatment model
propensity_model_factory=LGBMClassifier,
is_classification=False,
n_variants=2,
nuisance_model_params={"verbose": -1},
propensity_model_params={"verbose": -1},
)
metalearners_dr.fit_all_nuisance(
X=df[feature_columns],
y=df[outcome_column],
w=df[treatment_column],
)
metalearners_est = metalearners_dr.average_treatment_effect( # still need to pass data objects since DRLearner does not retain any data
X=df[feature_columns],
w=df[treatment_column],
y=df[outcome_column],
is_oos=False,
)
metalearners_est
(array([1.02931589]), array([0.06679633]))
Manual computation with pseudo outcome method produces the same estimate (average_treatment_effect does a generalisation of this under the hood) yields the same estimate
gamma_i = metalearners_dr._pseudo_outcome(
X=df[feature_columns],
w=df[treatment_column],
y=df[outcome_column],
treatment_variant=1,
is_oos=False,
)
gamma_i.mean(), gamma_i.std()/np.sqrt(n)
est, se = gamma_i.mean(), gamma_i.std()/np.sqrt(n)
print(f"est: {est}, se: {se}")
est: 1.029315891746861, se: 0.06679966900982734
from doubleml import DoubleMLIRM, DoubleMLData
dml_data = DoubleMLData(
df,
x_cols=feature_columns,
y_col=outcome_column,
d_cols=treatment_column,
)
aipw_mod = DoubleMLIRM(
dml_data,
ml_g = LGBMRegressor(),
ml_m = LGBMClassifier(),
n_folds=5,
)
aipw_mod.fit()
<doubleml.irm.irm.DoubleMLIRM at 0x30987f110>
print(doubleml_est := aipw_mod.summary.values[0, :2])
[1.08356716 0.05786543]
econML: LinearDRLearner¶
from econml.dr import LinearDRLearner
import formulaic as fm
print(ff := f"{outcome_column} ~ 0 + {'+'.join(feature_columns)}")
y, X = fm.Formula(ff).get_model_matrix(df, output="numpy")
W = df[treatment_column].values[:, np.newaxis]
Y ~ 0 + X0+X1+X2+X3
econml_dr = LinearDRLearner(model_regression=LGBMRegressor(), model_propensity=LGBMClassifier())
econml_dr.fit(y, T=W, W=X)
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
<econml.dr._drlearner.LinearDRLearner at 0x306499090>
print(econml_est := econml_dr.intercept__inference(1).summary_frame().iloc[0, :2].values)
[1.001 0.109]
comparison¶
All ml-based estimators yield comparable results (both point estimate and standard error).
pd.DataFrame(
np.c_[
naive_est,
linreg_est,
np.hstack(metalearners_est),
doubleml_est,
econml_est,
], index = ['est', 'se'],
columns = ['naive', 'linreg', 'metalearners', 'doubleml', 'econml']
)
| naive | linreg | metalearners | doubleml | econml | |
|---|---|---|---|---|---|
| est | 2.083595 | 2.143372 | 1.029316 | 1.083567 | 1.001 |
| se | 0.065267 | 0.063451 | 0.066796 | 0.057865 | 0.109 |
Multiple Treatment Arms¶
Next, we demonstrate the use of the treatment_effect method of the DRLearner class in a setting with multiple discrete treatments.
np.random.seed(123)
def multi_pscore_fn(x, num_treatments=3):
logits = np.zeros((x.shape[0], num_treatments))
logits[:, 0] = -x[:, 0] - x[:, 1] - x[:, 2] + x[:, 3]
logits[:, 1] = -x[:, 1] - x[:, 2] - x[:, 3] + x[:, 0]
logits[:, 2] = -x[:, 2] - x[:, 3] - x[:, 0] + x[:, 1]
# Apply softmax
exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
pscores = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
return pscores
# And then modify the multi_treatment_dgp function to use these probabilities:
def multi_treatment_dgp(n, k, pscore_fn, tau_fn, outcome_fn, k_cat=1, num_treatments=3):
"""DGP for a confounded multiple treatment assignment"""
Sigma = np.random.uniform(-1, 1, (k, k))
Sigma = Sigma @ Sigma.T
Xnum = np.random.multivariate_normal(np.zeros(k), Sigma, n)
Xcat = np.random.binomial(1, 0.5, (n, k_cat))
X = np.c_[Xnum, Xcat]
# Generate multiple treatments
pscores = pscore_fn(X, num_treatments)
W = np.zeros((n, num_treatments), dtype=int)
for i in range(n):
W[i, np.random.choice(num_treatments, p=pscores[i])] = 1
Y = outcome_fn(X, W, tau_fn)
df = pd.DataFrame(
np.c_[Y, X],
columns=["Y"]
+ [f"X{i}" for i in range(k + 1)],
).assign(
W=np.argmax(W, axis=1)
) # convert one-hot encoding to single column
return df
def multi_outcome_fn(x, w, taufn):
return (
np.sum(taufn(x) * w, axis=1) # sum over treatments
+ x[:, 0]
+ 2 * x[:, 1] ** 2
+ 3 * x[:, 3] * x[:, 1]
+ x[:, 2]
+ x[:, 3]
+ np.random.normal(0, 1, n)
)
def multi_tau_fn(x):
return np.c_[ 0, 1, 2]
n, k = 10_000, 3
df_multi = multi_treatment_dgp(
n, k, multi_pscore_fn, tau_fn=multi_tau_fn, outcome_fn=multi_outcome_fn
)
feature_columns = [f"X{i}" for i in range(k + 1)]
outcome_column, treatment_column = "Y", "W"
df_multi.head()
| Y | X0 | X1 | X2 | X3 | W | |
|---|---|---|---|---|---|---|
| 0 | 8.514662 | -0.104441 | 0.833485 | 1.802766 | 1.0 | 2 |
| 1 | 0.355338 | -0.690561 | -0.180011 | -1.715710 | 0.0 | 2 |
| 2 | 2.033627 | 0.685568 | -0.004838 | 0.671343 | 0.0 | 1 |
| 3 | 2.791453 | 1.984499 | -0.433412 | 0.921716 | 0.0 | 1 |
| 4 | 2.080426 | 0.189912 | -0.769235 | -0.450760 | 1.0 | 1 |
lm_multi = smf.ols("Y ~ C(W)", df_multi).fit()
lm_multi.params
Intercept 0.645659
C(W)[T.1] 1.777798
C(W)[T.2] 2.387949
dtype: float64
These estimates are substantially biased, since the model is misspecified (the true outcome and propensity score contains quadratic terms).
metalearners_dr_2 = DRLearner(
nuisance_model_factory=LGBMRegressor,
treatment_model_factory=DummyRegressor, # not actually used since we don't fit treatment model
propensity_model_factory=LGBMClassifier,
is_classification=False,
n_variants=3,
nuisance_model_params={"verbose": -1},
propensity_model_params={"verbose": -1},
)
metalearners_dr_2.fit_all_nuisance(
X=df_multi[feature_columns],
y=df_multi[outcome_column],
w=df_multi[treatment_column],
)
metalearners_est2 = metalearners_dr_2.average_treatment_effect( # still need to pass data objects since DRLearner does not retain any data
X=df_multi[feature_columns],
y=df_multi[outcome_column],
w=df_multi[treatment_column],
is_oos=False,
)
metalearners_est2
(array([1.04071166, 2.25658364]), array([0.14139908, 0.21346275]))
These estimates are less biased than those produced by the linear model.
Binary Outcome¶
Finally, we consider a case where the outcome is binary, in which case the DRLearner class is initialized with is_classification=True and the nuisance model is a classifier.
def classification_dgp(n, k, pscore_fn, tau_fn, outcome_fn, k_cat=1):
"""DGP for a confounded treatment assignment with binary outcome"""
Sigma = np.random.uniform(-1, 1, (k, k))
Sigma = Sigma @ Sigma.T
Xnum = np.random.multivariate_normal(np.zeros(k), Sigma, n)
Xcat = np.random.binomial(1, 0.5, (n, k_cat))
X = np.c_[Xnum, Xcat]
W = np.random.binomial(1, pscore_fn(X), n)
Y_prob = outcome_fn(X, W, tau_fn)
Y = np.random.binomial(1, Y_prob).astype(int)
df = pd.DataFrame(
np.c_[Y, W, X], columns=["Y", "W"] + [f"X{i}" for i in range(k + 1)]
).assign(Y = pd.to_numeric(Y, downcast='integer'))
return df
def classification_outcome_fn(x, w, taufn):
logits = (
taufn(x) * w
+ x[:, 0]
+ 2 * x[:, 1]
+ 3 * x[:, 3] * x[:, 1]
+ x[:, 2]
+ x[:, 3]
)
return 1 / (1 + np.exp(-logits))
# Propensity score function
pscore_fn = lambda x: 1 / (1 + np.exp(-x[:, 0] - x[:, 1] - x[:, 2] ** 2 + x[:, 3]))
# Treatment effect function - constant increase in probability of Y=1
classification_tau_fn = lambda x: 0.05
n, k = 10_000, 3
df_class = classification_dgp(
n, k, pscore_fn, tau_fn=classification_tau_fn, outcome_fn=classification_outcome_fn
)
feature_columns = [f"X{i}" for i in range(k + 1)]
outcome_column, treatment_column = "Y", "W"
df_class.head()
| Y | W | X0 | X1 | X2 | X3 | |
|---|---|---|---|---|---|---|
| 0 | 0 | 1.0 | 1.138203 | -1.439008 | -1.981465 | 0.0 |
| 1 | 1 | 1.0 | 0.297749 | -0.092953 | 1.455780 | 0.0 |
| 2 | 1 | 1.0 | 1.150090 | 0.545041 | 0.799303 | 0.0 |
| 3 | 1 | 0.0 | 0.832256 | -0.591107 | -1.076526 | 0.0 |
| 4 | 0 | 1.0 | 1.559507 | -1.018031 | -1.137247 | 0.0 |
# naive TE
df_class.groupby("W")["Y"].mean().diff()[1]
metalearners_dr_3 = DRLearner(
nuisance_model_factory=LGBMClassifier,
treatment_model_factory=DummyRegressor, # not actually used since we don't fit treatment model
propensity_model_factory=LGBMClassifier,
is_classification=True,
n_variants=2,
nuisance_model_params={"verbose": -1},
propensity_model_params={"verbose": -1},
)
metalearners_dr_3.fit_all_nuisance(
X=df_class[feature_columns],
y=df_class[outcome_column],
w=df_class[treatment_column],
)
metalearners_est_3 = metalearners_dr_3.average_treatment_effect( # still need to pass data objects since DRLearner does not retain any data
X=df_class[feature_columns],
y=df_class[outcome_column],
w=df_class[treatment_column],
is_oos=False,
)
metalearners_est_3
(array([0.04373356]), array([0.04191263]))
Again, we have lower bias than the naive comparison.