Skip to content

Commit 2cc9f62

Browse files
author
Miruna Oprescu
authored
Fix a few 0.9.0 bugs (#422)
* Relax sklearn requirement `>=0.24.0` --> `>0.22.0`
* Make `cate_feature_names` more robust to different featurizers
* Address PR comments
* Update name featurization in shap
1 parent 5e31584 commit 2cc9f62

File tree

9 files changed

+74
-39
lines changed

9 files changed

+74
-39
lines changed

econml/_cate_estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def _prefit(self, Y, T, *args, **kwargs):
7777
self._d_t = np.shape(T)[1:]
7878
# This works only if X is passed as a kwarg
7979
# We plan to enforce X as kwarg only in future releases
80-
if not hasattr(self, "_input_names_set"):
80+
if not hasattr(self, "_input_names_set") or not self._input_names_set:
8181
# This checks if names have been set in a child class
8282
# If names were set in a child class, don't do it again
8383
X = kwargs.get('X')

econml/_shap.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import shap
1616
from collections import defaultdict
1717
import numpy as np
18-
from .utilities import broadcast_unit_treatments, cross_product
18+
from .utilities import broadcast_unit_treatments, cross_product, get_feature_names_or_default
1919

2020

2121
def _shap_explain_cme(cme_model, X, d_t, d_y,
@@ -392,9 +392,6 @@ def _define_names(d_t, d_y, treatment_names, output_names, feature_names, input_
392392
feature_names = input_names['feature_names']
393393
if featurizer is None:
394394
transformed_feature_names = feature_names
395-
elif featurizer is not None and hasattr(featurizer, 'get_feature_names'):
396-
transformed_feature_names = featurizer.get_feature_names(feature_names)
397395
else:
398-
transformed_feature_names = None
399-
396+
transformed_feature_names = get_feature_names_or_default(featurizer, feature_names)
400397
return (d_t, d_y, treatment_names, output_names, feature_names, transformed_feature_names)

econml/dml/dml.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
broadcast_unit_treatments, check_high_dimensional,
3232
cross_product, deprecated, fit_with_groups,
3333
hstack, inverse_onehot, ndim, reshape,
34-
reshape_treatmentwise_effects, shape, transpose)
34+
reshape_treatmentwise_effects, shape, transpose,
35+
get_feature_names_or_default)
3536
from .._shap import _shap_explain_model_cate
3637

3738

@@ -281,11 +282,7 @@ def cate_feature_names(self, feature_names=None):
281282
feature_names = self._input_names["feature_names"]
282283
if self.original_featurizer is None:
283284
return feature_names
284-
elif hasattr(self.original_featurizer, 'get_feature_names'):
285-
# This fails if X=None and featurizer is not None, but that case is handled above
286-
return self.original_featurizer.get_feature_names(feature_names)
287-
else:
288-
raise AttributeError("Featurizer does not have a method: get_feature_names!")
285+
return get_feature_names_or_default(self.original_featurizer, feature_names)
289286

290287

291288
class DML(LinearModelFinalCateEstimatorMixin, _BaseDML):

econml/dr/_drlearner.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
from ..sklearn_extensions.linear_model import (
5353
DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper)
5454
from ..utilities import (_deprecate_positional, check_high_dimensional,
55-
filter_none_kwargs, fit_with_groups, inverse_onehot)
55+
filter_none_kwargs, fit_with_groups, inverse_onehot, get_feature_names_or_default)
5656
from .._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate
5757

5858

@@ -631,11 +631,7 @@ def cate_feature_names(self, feature_names=None):
631631
feature_names = self._input_names["feature_names"]
632632
if self.featurizer_ is None:
633633
return feature_names
634-
elif hasattr(self.featurizer_, 'get_feature_names'):
635-
# This fails if X=None and featurizer is not None, but that case is handled above
636-
return self.featurizer_.get_feature_names(feature_names)
637-
else:
638-
raise AttributeError("Featurizer does not have a method: get_feature_names!")
634+
return get_feature_names_or_default(self.featurizer_, feature_names)
639635

640636
@property
641637
def model_final_(self):

econml/iv/dml/_dml.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin
2424
from ...inference import StatsModelsInference
2525
from ...sklearn_extensions.linear_model import StatsModelsLinearRegression
26-
from ...utilities import _deprecate_positional
26+
from ...utilities import _deprecate_positional, get_feature_names_or_default
2727
from .._nuisance_wrappers import _FirstStageWrapper, _FinalWrapper
2828

2929

@@ -676,10 +676,7 @@ def cate_feature_names(self, feature_names=None):
676676
feature_names = self._input_names["feature_names"]
677677
if self.original_featurizer is None:
678678
return feature_names
679-
elif hasattr(self.original_featurizer, 'get_feature_names'):
680-
return self.original_featurizer.get_feature_names(feature_names)
681-
else:
682-
raise AttributeError("Featurizer does not have a method: get_feature_names!")
679+
return get_feature_names_or_default(self.original_featurizer, feature_names)
683680

684681

685682
class DMLIV(LinearModelFinalCateEstimatorMixin, _BaseDMLIV):

econml/iv/dr/_dr.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from ...inference import StatsModelsInference
2525
from ...sklearn_extensions.linear_model import StatsModelsLinearRegression
2626
from ...utilities import (_deprecate_positional, add_intercept, filter_none_kwargs,
27-
inverse_onehot)
27+
inverse_onehot, get_feature_names_or_default)
2828
from .._nuisance_wrappers import _FirstStageWrapper, _FinalWrapper
2929

3030

@@ -354,10 +354,7 @@ def cate_feature_names(self, feature_names=None):
354354
feature_names = self._input_names["feature_names"]
355355
if self.original_featurizer is None:
356356
return feature_names
357-
elif hasattr(self.original_featurizer, 'get_feature_names'):
358-
return self.original_featurizer.get_feature_names(feature_names)
359-
else:
360-
raise AttributeError("Featurizer does not have a method: get_feature_names!")
357+
return get_feature_names_or_default(self.original_featurizer, feature_names)
361358

362359

363360
class _IntentToTreatDRIVModelNuisance:

econml/tests/test_integration.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
from econml.ortho_forest import DMLOrthoForest, DROrthoForest
1313
from econml.sklearn_extensions.linear_model import WeightedLasso
1414
from econml.metalearners import XLearner, SLearner, TLearner
15+
from sklearn.compose import ColumnTransformer
1516
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
1617
from sklearn.linear_model import LinearRegression, MultiTaskLasso, LassoCV
18+
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
1719
from econml.ortho_iv import LinearIntentToTreatDRIV
1820
from econml.deepiv import DeepIVEstimator
1921

@@ -64,7 +66,23 @@ def test_dml(self):
6466
treatment_effects = est.effect(X)
6567
lb, ub = est.effect_interval(X, alpha=0.05)
6668
self._check_input_names(est.summary()) # Check that names propagate as expected
67-
# Test re-fit
69+
# |--> Test featurizers
70+
est.featurizer = PolynomialFeatures(degree=2, include_bias=False)
71+
est.fit(Y, T, X=X, W=W, inference='statsmodels')
72+
self._check_input_names(
73+
est.summary(),
74+
feat_comp=est.original_featurizer.get_feature_names(X.columns))
75+
est.featurizer = FunctionTransformer()
76+
est.fit(Y, T, X=X, W=W, inference='statsmodels')
77+
self._check_input_names(
78+
est.summary(),
79+
feat_comp=[f"feat(X){i}" for i in range(TestPandasIntegration.n_features)])
80+
est.featurizer = ColumnTransformer([('passthrough', 'passthrough', [0])])
81+
est.fit(Y, T, X=X, W=W, inference='statsmodels')
82+
# ColumnTransformer doesn't propagate column names
83+
self._check_input_names(est.summary(), feat_comp=["x0"])
84+
# |--> Test re-fit
85+
est.featurizer = None
6886
X1 = X.rename(columns={c: "{}_1".format(c) for c in X.columns})
6987
est.fit(Y, T, X=X1, W=W, inference='statsmodels')
7088
self._check_input_names(est.summary(), feat_comp=X1.columns)
@@ -74,7 +92,7 @@ def test_dml(self):
7492
treatment_effects = est.effect(X)
7593
lb, ub = est.effect_interval(X, alpha=0.05)
7694
self._check_input_names(est.summary()) # Check that names propagate as expected
77-
# ForestDML
95+
# Test ForestDML
7896
est = ForestDML(model_y=GradientBoostingRegressor(), model_t=GradientBoostingRegressor())
7997
est.fit(Y, T, X=X, W=W, inference='blb')
8098
treatment_effects = est.effect(X)

econml/utilities.py

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
import scipy.sparse
99
import sparse as sp
1010
import itertools
11+
import inspect
1112
from operator import getitem
1213
from collections import defaultdict, Counter
1314
from sklearn import clone
1415
from sklearn.base import TransformerMixin, BaseEstimator
1516
from sklearn.linear_model import LassoCV, MultiTaskLassoCV, Lasso, MultiTaskLasso
1617
from functools import reduce, wraps
1718
from sklearn.utils import check_array, check_X_y
19+
from sklearn.utils.validation import assert_all_finite
1820
import warnings
1921
from warnings import warn
2022
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
@@ -512,7 +514,7 @@ def check_inputs(Y, T, X, W=None, multi_output_T=True, multi_output_Y=True):
512514
return Y, T, X, W
513515

514516

515-
def check_input_arrays(*args, validate_len=True):
517+
def check_input_arrays(*args, validate_len=True, force_all_finite=True):
516518
"""Cast input sequences into numpy arrays.
517519
518520
Only inputs that are sequence-like will be converted, all other inputs will be left as is.
@@ -526,23 +528,35 @@ def check_input_arrays(*args, validate_len=True):
526528
validate_len : bool (default=True)
527529
Whether to check if the input arrays have the same length.
528530
531+
force_all_finite : bool (default=True)
532+
Whether to allow inf and nan in input arrays.
533+
529534
Returns
530535
-------
531536
args: array-like
532537
List of inputs where sequence-like objects have been cast to numpy arrays.
533538
534539
"""
535-
args = [check_array(arg, dtype=None, ensure_2d=False, accept_sparse=True)
536-
if np.ndim(arg) > 0 else arg for arg in args]
537-
if validate_len:
538-
n = None
539-
for arg in args:
540-
if np.ndim(arg) > 0:
541-
m = arg.shape[0]
540+
n = None
541+
args = list(args)
542+
for i, arg in enumerate(args):
543+
if np.ndim(arg) > 0:
544+
new_arg = check_array(arg, dtype=None, ensure_2d=False, accept_sparse=True,
545+
force_all_finite=force_all_finite)
546+
if not force_all_finite:
547+
# For when checking input values is disabled
548+
try:
549+
assert_all_finite(new_arg)
550+
except ValueError:
551+
warnings.warn("Input contains NaN, infinity or a value too large for dtype('float64') "
552+
"but input check is disabled. Check the inputs before proceeding.")
553+
if validate_len:
554+
m = new_arg.shape[0]
542555
if n is None:
543556
n = m
544557
else:
545558
assert (m == n), "Input arrays have incompatible lengths: {} and {}".format(n, m)
559+
args[i] = new_arg
546560
return args
547561

548562

@@ -582,6 +596,25 @@ def get_input_columns(X, prefix="X"):
582596
return [f"{prefix}{i}" for i in range(len_X)]
583597

584598

599+
def get_feature_names_or_default(featurizer, feature_names):
600+
if hasattr(featurizer, 'get_feature_names'):
601+
# Get number of arguments, some sklearn featurizer don't accept feature_names
602+
arg_no = len(inspect.getfullargspec(featurizer.get_feature_names).args)
603+
if arg_no == 1:
604+
return featurizer.get_feature_names()
605+
elif arg_no == 2:
606+
return featurizer.get_feature_names(feature_names)
607+
# Featurizer doesn't have 'get_feature_names' or has atypical 'get_feature_names'
608+
try:
609+
# Get feature names using featurizer
610+
dummy_X = np.ones((1, len(feature_names)))
611+
return get_input_columns(featurizer.transform(dummy_X), prefix="feat(X)")
612+
except Exception:
613+
# All attempts at retrieving transformed feature names have failed
614+
# Delegate handling to downstream logic
615+
return None
616+
617+
585618
def check_models(models, n):
586619
"""
587620
Input validation for metalearner models.

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ packages = find_namespace:
3333
install_requires =
3434
numpy
3535
scipy > 1.4.0
36-
scikit-learn >= 0.24
36+
scikit-learn > 0.22.0
3737
sparse
3838
joblib >= 0.13.0
3939
numba != 0.42.1

0 commit comments

Comments (0)