From 13e3343b02f1e3f67eb35cc26b40f39d88aadbb0 Mon Sep 17 00:00:00 2001
From: "D. Russell Richie"
Date: Fri, 27 Mar 2026 17:55:14 -0400
Subject: [PATCH] fix: impute NaN before variable-selection steps in stats.py

sklearn (ElasticNet) and statsmodels (OLS/VIF) raise errors when input
features contain NaN values. This is triggered in practice when a dataset
uses LightGBM's native NaN-handling (e.g. sparse binary indicators like
"has_garage" or "has_fireplace" where NaN means "not recorded") and runs
the variable-selection pre-pass before LightGBM training.

The fix adds median imputation of NaN to the top of each of the four
variable-selection functions:

- calc_elastic_net_regularization
- calc_p_values_recursive_drop
- calc_t_values_recursive_drop
- calc_vif_recursive_drop

Imputation is scoped to these pre-passes only: LightGBM training still
receives the real NaN values and handles them natively at each split.
A UserWarning is emitted listing the affected columns so the user is
aware.

Co-Authored-By: Claude Sonnet 4.6
---
 openavmkit/utilities/stats.py | 54 ++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/openavmkit/utilities/stats.py b/openavmkit/utilities/stats.py
index 75303d3e..53111148 100644
--- a/openavmkit/utilities/stats.py
+++ b/openavmkit/utilities/stats.py
@@ -1044,6 +1044,19 @@ def calc_elastic_net_regularization(
 
     X = X.copy()
 
+    # Impute NaN with column medians before standardization.
+    # ElasticNet (sklearn) does not accept NaN natively; median imputation is a neutral
+    # choice for this variable-selection pre-pass — LightGBM training still sees real NaN.
+    if X.isnull().values.any():
+        import warnings
+        warnings.warn(
+            f"calc_elastic_net_regularization: NaN detected in "
+            f"{list(X.columns[X.isnull().any()])}. "
+            "Imputing with column medians for the ElasticNet variable-selection step only.",
+            UserWarning,
+        )
+        X = X.fillna(X.median(numeric_only=True))
+
     # Standardize the features
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
@@ -1195,9 +1208,22 @@ def calc_p_values_recursive_drop(
     """
     X = X.copy()
+
+    # Impute NaN with column medians — statsmodels OLS does not accept NaN natively.
+    # This is only for the p-value variable-selection pre-pass; LightGBM training sees real NaN.
+    if X.isnull().values.any():
+        import warnings
+        warnings.warn(
+            f"calc_p_values_recursive_drop: NaN detected in "
+            f"{list(X.columns[X.isnull().any()])}. "
+            "Imputing with column medians for the OLS variable-selection step only.",
+            UserWarning,
+        )
+        X = X.fillna(X.median(numeric_only=True))
+
     X = sm.add_constant(X, has_constant='add')
     X = X.astype(np.float64)
-
+
     model = None
     try:
         model = sm.OLS(y, X).fit()
@@ -1287,6 +1313,19 @@ def calc_t_values_recursive_drop(
     """
     X = X.copy()
+
+    # Impute NaN with column medians — statsmodels OLS does not accept NaN natively.
+    # This is only for the t-value variable-selection pre-pass; LightGBM training sees real NaN.
+    if X.isnull().values.any():
+        import warnings
+        warnings.warn(
+            f"calc_t_values_recursive_drop: NaN detected in "
+            f"{list(X.columns[X.isnull().any()])}. "
+            "Imputing with column medians for the OLS variable-selection step only.",
+            UserWarning,
+        )
+        X = X.fillna(X.median(numeric_only=True))
+
     X = sm.add_constant(X, has_constant='add')
     X = X.astype(np.float64)
@@ -1395,6 +1434,19 @@ def calc_vif_recursive_drop(
         If no columns remain for VIF calculation.
     """
     X = X.copy()
+
+    # Impute NaN with column medians — VIF (OLS-based) does not accept NaN natively.
+    # This is only for the VIF variable-selection pre-pass; LightGBM training sees real NaN.
+    if X.isnull().values.any():
+        import warnings
+        warnings.warn(
+            f"calc_vif_recursive_drop: NaN detected in "
+            f"{list(X.columns[X.isnull().any()])}. "
+            "Imputing with column medians for the VIF variable-selection step only.",
+            UserWarning,
+        )
+        X = X.fillna(X.median(numeric_only=True))
+
     X = X.astype(np.float64)
 
     # Get boolean and categorical variables from settings if provided