Skip to content

Commit 2cc9f62

Browse files
author
Miruna Oprescu
authored
Fix a few 0.9.0 bugs (#422)
* Relax sklearn requirement `>=0.24.0` --> `>0.22.0`
* Make `cate_feature_names` more robust to different featurizers
* Address PR comments
* Update name featurization in shap
1 parent 5e31584 commit 2cc9f62

File tree

9 files changed

+74
-39
lines changed

9 files changed

+74
-39
lines changed

econml/_cate_estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def _prefit(self, Y, T, *args, **kwargs):
7777
self._d_t = np.shape(T)[1:]
7878
# This works only if X is passed as a kwarg
7979
# We plan to enforce X as kwarg only in future releases
80-
if not hasattr(self, "_input_names_set"):
80+
if not hasattr(self, "_input_names_set") or not self._input_names_set:
8181
# This checks if names have been set in a child class
8282
# If names were set in a child class, don't do it again
8383
X = kwargs.get('X')

econml/_shap.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import shap
1616
from collections import defaultdict
1717
import numpy as np
18-
from .utilities import broadcast_unit_treatments, cross_product
18+
from .utilities import broadcast_unit_treatments, cross_product, get_feature_names_or_default
1919

2020

2121
def _shap_explain_cme(cme_model, X, d_t, d_y,
@@ -392,9 +392,6 @@ def _define_names(d_t, d_y, treatment_names, output_names, feature_names, input_
392392
feature_names = input_names['feature_names']
393393
if featurizer is None:
394394
transformed_feature_names = feature_names
395-
elif featurizer is not None and hasattr(featurizer, 'get_feature_names'):
396-
transformed_feature_names = featurizer.get_feature_names(feature_names)
397395
else:
398-
transformed_feature_names = None
399-
396+
transformed_feature_names = get_feature_names_or_default(featurizer, feature_names)
400397
return (d_t, d_y, treatment_names, output_names, feature_names, transformed_feature_names)

econml/dml/dml.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
broadcast_unit_treatments, check_high_dimensional,
3232
cross_product, deprecated, fit_with_groups,
3333
hstack, inverse_onehot, ndim, reshape,
34-
reshape_treatmentwise_effects, shape, transpose)
34+
reshape_treatmentwise_effects, shape, transpose,
35+
get_feature_names_or_default)
3536
from .._shap import _shap_explain_model_cate
3637

3738

@@ -281,11 +282,7 @@ def cate_feature_names(self, feature_names=None):
281282
feature_names = self._input_names["feature_names"]
282283
if self.original_featurizer is None:
283284
return feature_names
284-
elif hasattr(self.original_featurizer, 'get_feature_names'):
285-
# This fails if X=None and featurizer is not None, but that case is handled above
286-
return self.original_featurizer.get_feature_names(feature_names)
287-
else:
288-
raise AttributeError("Featurizer does not have a method: get_feature_names!")
285+
return get_feature_names_or_default(self.original_featurizer, feature_names)
289286

290287

291288
class DML(LinearModelFinalCateEstimatorMixin, _BaseDML):

econml/dr/_drlearner.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
from ..sklearn_extensions.linear_model import (
5353
DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper)
5454
from ..utilities import (_deprecate_positional, check_high_dimensional,
55-
filter_none_kwargs, fit_with_groups, inverse_onehot)
55+
filter_none_kwargs, fit_with_groups, inverse_onehot, get_feature_names_or_default)
5656
from .._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate
5757

5858

@@ -631,11 +631,7 @@ def cate_feature_names(self, feature_names=None):
631631
feature_names = self._input_names["feature_names"]
632632
if self.featurizer_ is None:
633633
return feature_names
634-
elif hasattr(self.featurizer_, 'get_feature_names'):
635-
# This fails if X=None and featurizer is not None, but that case is handled above
636-
return self.featurizer_.get_feature_names(feature_names)
637-
else:
638-
raise AttributeError("Featurizer does not have a method: get_feature_names!")
634+
return get_feature_names_or_default(self.featurizer_, feature_names)
639635

640636
@property
641637
def model_final_(self):

econml/iv/dml/_dml.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin
2424
from ...inference import StatsModelsInference
2525
from ...sklearn_extensions.linear_model import StatsModelsLinearRegression
26-
from ...utilities import _deprecate_positional
26+
from ...utilities import _deprecate_positional, get_feature_names_or_default
2727
from .._nuisance_wrappers import _FirstStageWrapper, _FinalWrapper
2828

2929

@@ -676,10 +676,7 @@ def cate_feature_names(self, feature_names=None):
676676
feature_names = self._input_names["feature_names"]
677677
if self.original_featurizer is None:
678678
return feature_names
679-
elif hasattr(self.original_featurizer, 'get_feature_names'):
680-
return self.original_featurizer.get_feature_names(feature_names)
681-
else:
682-
raise AttributeError("Featurizer does not have a method: get_feature_names!")
679+
return get_feature_names_or_default(self.original_featurizer, feature_names)
683680

684681

685682
class DMLIV(LinearModelFinalCateEstimatorMixin, _BaseDMLIV):

econml/iv/dr/_dr.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from ...inference import StatsModelsInference
2525
from ...sklearn_extensions.linear_model import StatsModelsLinearRegression
2626
from ...utilities import (_deprecate_positional, add_intercept, filter_none_kwargs,
27-
inverse_onehot)
27+
inverse_onehot, get_feature_names_or_default)
2828
from .._nuisance_wrappers import _FirstStageWrapper, _FinalWrapper
2929

3030

@@ -354,10 +354,7 @@ def cate_feature_names(self, feature_names=None):
354354
feature_names = self._input_names["feature_names"]
355355
if self.original_featurizer is None:
356356
return feature_names
357-
elif hasattr(self.original_featurizer, 'get_feature_names'):
358-
return self.original_featurizer.get_feature_names(feature_names)
359-
else:
360-
raise AttributeError("Featurizer does not have a method: get_feature_names!")
357+
return get_feature_names_or_default(self.original_featurizer, feature_names)
361358

362359

363360
class _IntentToTreatDRIVModelNuisance:

econml/tests/test_integration.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
from econml.ortho_forest import DMLOrthoForest, DROrthoForest
1313
from econml.sklearn_extensions.linear_model import WeightedLasso
1414
from econml.metalearners import XLearner, SLearner, TLearner
15+
from sklearn.compose import ColumnTransformer
1516
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
1617
from sklearn.linear_model import LinearRegression, MultiTaskLasso, LassoCV
18+
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
1719
from econml.ortho_iv import LinearIntentToTreatDRIV
1820
from econml.deepiv import DeepIVEstimator
1921

@@ -64,7 +66,23 @@ def test_dml(self):
6466
treatment_effects = est.effect(X)
6567
lb, ub = est.effect_interval(X, alpha=0.05)
6668
self._check_input_names(est.summary()) # Check that names propagate as expected
67-
# Test re-fit
69+
# |--> Test featurizers
70+
est.featurizer = PolynomialFeatures(degree=2, include_bias=False)
71+
est.fit(Y, T, X=X, W=W, inference='statsmodels')
72+
self._check_input_names(
73+
est.summary(),
74+
feat_comp=est.original_featurizer.get_feature_names(X.columns))
75+
est.featurizer = FunctionTransformer()
76+
est.fit(Y, T, X=X, W=W, inference='statsmodels')
77+
self._check_input_names(
78+
est.summary(),
79+
feat_comp=[f"feat(X){i}" for i in range(TestPandasIntegration.n_features)])
80+
est.featurizer = ColumnTransformer([('passthrough', 'passthrough', [0])])
81+
est.fit(Y, T, X=X, W=W, inference='statsmodels')
82+
# ColumnTransformer doesn't propagate column names
83+
self._check_input_names(est.summary(), feat_comp=["x0"])
84+
# |--> Test re-fit
85+
est.featurizer = None
6886
X1 = X.rename(columns={c: "{}_1".format(c) for c in X.columns})
6987
est.fit(Y, T, X=X1, W=W, inference='statsmodels')
7088
self._check_input_names(est.summary(), feat_comp=X1.columns)
@@ -74,7 +92,7 @@ def test_dml(self):
7492
treatment_effects = est.effect(X)
7593
lb, ub = est.effect_interval(X, alpha=0.05)
7694
self._check_input_names(est.summary()) # Check that names propagate as expected
77-
# ForestDML
95+
# Test ForestDML
7896
est = ForestDML(model_y=GradientBoostingRegressor(), model_t=GradientBoostingRegressor())
7997
est.fit(Y, T, X=X, W=W, inference='blb')
8098
treatment_effects = est.effect(X)

econml/utilities.py

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
import scipy.sparse
99
import sparse as sp
1010
import itertools
11+
import inspect
1112
from operator import getitem
1213
from collections import defaultdict, Counter
1314
from sklearn import clone
1415
from sklearn.base import TransformerMixin, BaseEstimator
1516
from sklearn.linear_model import LassoCV, MultiTaskLassoCV, Lasso, MultiTaskLasso
1617
from functools import reduce, wraps
1718
from sklearn.utils import check_array, check_X_y
19+
from sklearn.utils.validation import assert_all_finite
1820
import warnings
1921
from warnings import warn
2022
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
@@ -512,7 +514,7 @@ def check_inputs(Y, T, X, W=None, multi_output_T=True, multi_output_Y=True):
512514
return Y, T, X, W
513515

514516

515-
def check_input_arrays(*args, validate_len=True):
517+
def check_input_arrays(*args, validate_len=True, force_all_finite=True):
516518
"""Cast input sequences into numpy arrays.
517519
518520
Only inputs that are sequence-like will be converted, all other inputs will be left as is.
@@ -526,23 +528,35 @@ def check_input_arrays(*args, validate_len=True):
526528
validate_len : bool (default=True)
527529
Whether to check if the input arrays have the same length.
528530
531+
force_all_finite : bool (default=True)
532+
Whether to allow inf and nan in input arrays.
533+
529534
Returns
530535
-------
531536
args: array-like
532537
List of inputs where sequence-like objects have been cast to numpy arrays.
533538
534539
"""
535-
args = [check_array(arg, dtype=None, ensure_2d=False, accept_sparse=True)
536-
if np.ndim(arg) > 0 else arg for arg in args]
537-
if validate_len:
538-
n = None
539-
for arg in args:
540-
if np.ndim(arg) > 0:
541-
m = arg.shape[0]
540+
n = None
541+
args = list(args)
542+
for i, arg in enumerate(args):
543+
if np.ndim(arg) > 0:
544+
new_arg = check_array(arg, dtype=None, ensure_2d=False, accept_sparse=True,
545+
force_all_finite=force_all_finite)
546+
if not force_all_finite:
547+
# For when checking input values is disabled
548+
try:
549+
assert_all_finite(new_arg)
550+
except ValueError:
551+
warnings.warn("Input contains NaN, infinity or a value too large for dtype('float64') "
552+
"but input check is disabled. Check the inputs before proceeding.")
553+
if validate_len:
554+
m = new_arg.shape[0]
542555
if n is None:
543556
n = m
544557
else:
545558
assert (m == n), "Input arrays have incompatible lengths: {} and {}".format(n, m)
559+
args[i] = new_arg
546560
return args
547561

548562

@@ -582,6 +596,25 @@ def get_input_columns(X, prefix="X"):
582596
return [f"{prefix}{i}" for i in range(len_X)]
583597

584598

599+
def get_feature_names_or_default(featurizer, feature_names):
600+
if hasattr(featurizer, 'get_feature_names'):
601+
# Get number of arguments, some sklearn featurizer don't accept feature_names
602+
arg_no = len(inspect.getfullargspec(featurizer.get_feature_names).args)
603+
if arg_no == 1:
604+
return featurizer.get_feature_names()
605+
elif arg_no == 2:
606+
return featurizer.get_feature_names(feature_names)
607+
# Featurizer doesn't have 'get_feature_names' or has atypical 'get_feature_names'
608+
try:
609+
# Get feature names using featurizer
610+
dummy_X = np.ones((1, len(feature_names)))
611+
return get_input_columns(featurizer.transform(dummy_X), prefix="feat(X)")
612+
except Exception:
613+
# All attempts at retrieving transformed feature names have failed
614+
# Delegate handling to downstream logic
615+
return None
616+
617+
585618
def check_models(models, n):
586619
"""
587620
Input validation for metalearner models.

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ packages = find_namespace:
3333
install_requires =
3434
numpy
3535
scipy > 1.4.0
36-
scikit-learn >= 0.24
36+
scikit-learn > 0.22.0
3737
sparse
3838
joblib >= 0.13.0
3939
numba != 0.42.1

0 commit comments

Comments (0)