Example: Using Sparse Covariate Matrices¶
Motivation¶
In many applications, we want to adjust for categorical covariates with many levels. As a natural pre-processing step, this may involve one-hot-encoding the covariates, which can lead to a high-dimensional covariate matrix, which is typically very sparse. Many scikit-style learners accept (scipy's) sparse matrices as input, which allows us to use them for treatment effect estimation as well.
Example¶
import time, psutil, os, gc
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor, LGBMClassifier
from metalearners import DRLearner
# This is required for when nbconvert converts the cell-magic to regular function calls.
from IPython import get_ipython
def get_memory_usage():
process = psutil.Process(os.getpid())
return process.memory_info().rss / 1024 / 1024 # in MB
We generate some data where X comprises of 100 categorical variables with 1000 possible levels. Naively one-hot-encoding this data produces a very large matrix with many zeroes, which is an ideal application of scipy.sparse.csr_matrix. We then use the DRLearner to estimate the treatment effect.
def generate_causal_data(
n_samples=100_000,
n_categories=500,
n_features=100,
tau_magnitude=1.0,
):
######################################################################
# Generate covariate matrix X
X = np.random.randint(0, n_categories, size=(n_samples, n_features))
######################################################################
# Generate potential outcome y0
y0 = np.zeros(n_samples)
# Select a few features for main effects
main_effect_features = np.random.choice(n_features, 3, replace=False)
# Create main effects - fully dense
for i in main_effect_features:
category_effects = np.random.normal(0, 4, n_categories)
y0 += category_effects[X[:, i]]
# Select a couple of feature pairs for interaction effects
interaction_pairs = [
(i, j) for i in range(n_features) for j in range(i + 1, n_features)
]
selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)
# Create interaction effects
for idx in selected_interactions:
i, j = interaction_pairs[idx]
interaction_effect = np.random.choice(
[-1, 0, 1], size=(n_categories, n_categories), p=[0.25, 0.5, 0.25]
)
y0 += interaction_effect[X[:, i], X[:, j]]
# Normalize y0
y0 = (y0 - np.mean(y0)) / np.std(y0)
y0 += np.random.normal(0, 0.1, n_samples)
######################################################################
# Generate treatment assignment W
propensity_score = np.zeros(n_samples)
for i in main_effect_features:
category_effects = np.random.normal(0, 4, n_categories)
propensity_score += category_effects[X[:, i]]
# same interactions enter pscore
# Create interaction effects
for idx in selected_interactions:
i, j = interaction_pairs[idx]
interaction_effect = np.random.choice(
[-1, 0, 1], size=(n_categories, n_categories), p=[0.25, 0.5, 0.25]
)
propensity_score += interaction_effect[X[:, i], X[:, j]]
# Convert to probabilities using logistic function
propensity_score = sp.special.expit(propensity_score)
# Generate binary treatment
W = np.random.binomial(1, propensity_score)
######################################################################
# Generate treatment effect
tau = tau_magnitude * np.ones(n_samples)
# Generate final outcome
Y = y0 + W * tau
return X, W, Y, tau, propensity_score
X, W, Y, tau, propensity_score = generate_causal_data(
n_samples=1000, tau_magnitude=1.0
)
Xdf = pd.DataFrame(X)
# sparse and dense X matrices
e1 = OneHotEncoder(
sparse_output=True
) # onehot encoder generates sparse output automatically
X_csr = e1.fit_transform(X)
X_np = pd.get_dummies(
Xdf, columns=Xdf.columns
).values # dense onehot encoding with pandas
print(f"\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB")
print(f"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB")
Sparse data memory: 0.76MB
Dense data memory: 41.25MB
As expected, the memory footprint of the sparse matrix is considerably smaller than the dense matrix.
def fit_drlearner_wrapper(X, name):
start_memory = get_memory_usage()
start_time = time.time()
metalearners_dr = DRLearner(
nuisance_model_factory=LGBMRegressor,
treatment_model_factory=DummyRegressor,
propensity_model_factory=LGBMClassifier,
is_classification=False,
n_variants=2,
nuisance_model_params={"verbose": -1},
propensity_model_params={"verbose": -1},
)
metalearners_dr.fit_all_nuisance(
X=X,
y=Y,
w=W,
)
metalearners_est = metalearners_dr.average_treatment_effect(
X=X,
y=Y,
w=W,
is_oos=False,
)
end_time = time.time()
end_memory = get_memory_usage()
runtime = end_time - start_time
memory_used = end_memory - start_memory
print(f"{name} data - Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB")
print(metalearners_est)
scipy.sparse.csr_matrix input
fit_drlearner_wrapper(X_csr, "Sparse")
Sparse data - Runtime: 2.35s, Memory used: 99.75MB
(array([0.97906962]), array([0.06407002]))
np.ndarray input
fit_drlearner_wrapper(X_np, "Dense")
Dense data - Runtime: 4.42s, Memory used: 812.55MB
(array([0.97980121]), array([0.06401631]))