# Reviewer notes (text before the first "diff --git" header is ignored by
# git-apply/patch).
#
# * The patch arrived with its newlines and indentation collapsed. Line
#   structure was rebuilt from the hunk line counts and Python syntax.
#   Indentation depth of *context* lines, the width of the two removed
#   whitespace-only lines, and the position of blank context lines are
#   reconstructions -- verify against the target tree (or apply with
#   --ignore-whitespace) before merging.
# * predict_gwr: the new missing-column check raises ValueError instead of
#   using `assert`, which is stripped under `python -O`. The hunk grew by one
#   line, so the new-side start lines of later modeling.py hunks shift by +1.
# * run_layeredcompbagging: the docstring now states that outpath,
#   save_params and use_saved_params are accepted but not used by the body.
# * The "index" hashes for openavmkit/modeling.py come from the original
#   patch and no longer match the post-image produced by this version.
# * Left as submitted, but worth a follow-up: the new
#   `elif model_engine == "layeredcompbagging"` branch in get_data_split_for
#   assigns the same value as the `else` branch; `Dict` is imported in
#   utilities/modeling.py but not used by any added line; the model
#   hyperparameters (tree_count, sample_pct, random_state, n_jobs) are
#   hard-coded.

diff --git a/openavmkit/benchmark.py b/openavmkit/benchmark.py
index 69378ddb..5419ccef 100644
--- a/openavmkit/benchmark.py
+++ b/openavmkit/benchmark.py
@@ -36,6 +36,7 @@
     run_lightgbm,
     run_catboost,
     run_slice,
+    run_layeredcompbagging,
     run_garbage,
     run_average,
     run_naive_area,
@@ -57,6 +58,7 @@
     predict_catboost,
     predict_lightgbm,
     predict_slice,
+    predict_layeredcompbagging,
     predict_ground_truth,
     predict_spatial_lag,
     GarbageModel,
@@ -1154,6 +1156,8 @@ def get_data_split_for(
         df_sales = _clean_categoricals(df_sales, fields_cat, settings)
         df_universe = _clean_categoricals(df_universe, fields_cat, settings)
         _ind_vars = ind_vars
+    elif model_engine == "layeredcompbagging":
+        _ind_vars = ind_vars
     else:
         _ind_vars = ind_vars
         if model_engine == "gwr" or model_engine == "kernel":
@@ -1421,6 +1425,10 @@ def run_one_model(
         results = run_catboost(
             ds, outpath, save_params, use_saved_params, n_trials=n_trials, verbose=verbose, use_gpu=use_gpu
         )
+    elif model_engine == "layeredcompbagging":
+        results = run_layeredcompbagging(
+            ds, outpath, save_params, use_saved_params, n_trials=n_trials, verbose=verbose
+        )
     elif model_engine == "slice":
         results = run_slice(ds, verbose=verbose)
     else:
diff --git a/openavmkit/modeling.py b/openavmkit/modeling.py
index 55d86412..60420d10 100644
--- a/openavmkit/modeling.py
+++ b/openavmkit/modeling.py
@@ -25,6 +25,7 @@
 import xgboost as xgb
 import lightgbm as lgb
 import catboost
+from layeredcompmodel import LayeredCompBaggingModel as LCBModel
 from catboost import CatBoostRegressor, Pool
 from lightgbm import Booster
 from matplotlib import pyplot as plt
@@ -77,6 +78,7 @@
     XGBoostModel,
     LightGBMModel,
     CatBoostModel,
+    LayeredCompBaggingModel,
     MultiMRAModel,
     GroundTruthModel,
     SpatialLagModel,
@@ -3038,8 +3040,11 @@ def predict_gwr(
     model_name = ds.name
 
     # Organize the parameters
-    
+
     ## Generate column names, accounting for the intercept
+    missing = [c for c in ds.ind_vars if c not in ds.X_train.columns]
+    if missing:
+        raise ValueError(f"Missing variables from dataframe: {missing}")
     cols = (["intercept"] + list(ds.ind_vars)) if intercept else list(ds.ind_vars)
 
     ## Get the key/key sale values to accompany each row
@@ -3048,7 +3053,7 @@ def predict_gwr(
     sales_list_key_sale = ds.df_sales["key_sale"].values.tolist()
     sales_list_key = ds.df_sales["key"].values.tolist()
     univ_list_key = ds.df_universe["key"].values.tolist()
-    
+
     ## Generate dataframes for each set of parameters, and add the keys
     df_params_test = pd.DataFrame(params_test, columns=cols)
     df_params_test.insert(0, "key_sale", test_list_key_sale)
@@ -3778,6 +3783,142 @@ def run_slice(
     return predict_slice(ds, slice_model, timing, verbose)
 
 
+def run_layeredcompbagging(
+    ds: DataSplit,
+    outpath: str,
+    save_params: bool = False,
+    use_saved_params: bool = False,
+    n_trials: int = 50,
+    verbose: bool = False,
+) -> SingleModelResults:
+    """
+    Run a LayeredCompBagging model by training and predicting.
+
+    Parameters
+    ----------
+    ds : DataSplit
+        DataSplit object; split in place, categorical X columns cast to object.
+    outpath : str
+        Currently unused by this engine. Kept for API consistency.
+    save_params : bool, optional
+        Currently unused by this engine. Kept for API consistency. Defaults to False.
+    use_saved_params : bool, optional
+        Currently unused by this engine. Kept for API consistency. Defaults to False.
+    n_trials : int, optional
+        Not used for LayeredCompBagging. Kept for API consistency. Defaults to 50.
+    verbose : bool, optional
+        If True, print verbose output. Defaults to False.
+
+    Returns
+    -------
+    SingleModelResults
+        Prediction results from the LayeredCompBagging model.
+    """
+
+    timing = TimingData()
+
+    timing.start("total")
+
+    timing.start("setup")
+    ds.split()
+
+    # layeredcompmodel internally calls fillna("NaN"); this fails on pandas
+    # Categorical unless "NaN" is a declared category. Coerce categories
+    # to plain object dtype for all splits before fit/predict.
+    ds.X_train = _coerce_categoricals_to_object(ds.X_train)
+    ds.X_test = _coerce_categoricals_to_object(ds.X_test)
+    ds.X_sales = _coerce_categoricals_to_object(ds.X_sales)
+    ds.X_univ = _coerce_categoricals_to_object(ds.X_univ)
+    timing.stop("setup")
+
+    timing.start("parameter_search")  # no search is run: settings below are fixed
+    timing.stop("parameter_search")
+
+    timing.start("train")
+
+    # Train the LayeredCompBagging model
+    lcb_model = LCBModel(tree_count=10, sample_pct=0.95, random_state=42, n_jobs=4)
+    lcb_model.fit(ds.X_train, ds.y_train)
+
+    # Wrap it in our wrapper class
+    wrapped_model = LayeredCompBaggingModel(lcb_model)
+
+    timing.stop("train")
+
+    return predict_layeredcompbagging(ds, wrapped_model, timing, verbose)
+
+
+def predict_layeredcompbagging(
+    ds: DataSplit,
+    lcb_model: LayeredCompBaggingModel,
+    timing: TimingData,
+    verbose: bool = False,
+) -> SingleModelResults:
+    """
+    Generate predictions using a LayeredCompBagging model.
+
+    Parameters
+    ----------
+    ds : DataSplit
+        DataSplit object containing train/test/universe splits.
+    lcb_model : LayeredCompBaggingModel
+        Trained LayeredCompBaggingModel instance.
+    timing : TimingData
+        TimingData object for recording performance metrics.
+    verbose : bool, optional
+        If True, print verbose output. Defaults to False.
+
+    Returns
+    -------
+    SingleModelResults
+        Prediction results from the LayeredCompBagging model.
+    """
+
+    regressor = lcb_model.model
+
+    timing.start("predict_test")
+    if len(ds.y_test) == 0:
+        y_pred_test = np.array([])
+    else:
+        y_pred_test = regressor.predict(ds.X_test)
+    timing.stop("predict_test")
+
+    timing.start("predict_sales")
+    if len(ds.y_sales) == 0:
+        y_pred_sales = np.array([])
+    else:
+        y_pred_sales = regressor.predict(ds.X_sales)
+    timing.stop("predict_sales")
+
+    timing.start("predict_univ")
+    if len(ds.X_univ) == 0:
+        y_pred_univ = np.array([])
+    else:
+        y_pred_univ = regressor.predict(ds.X_univ)
+    timing.stop("predict_univ")
+
+    timing.stop("total")
+
+    model_name = ds.name
+    model_engine = "layeredcompbagging"
+
+    results = SingleModelResults(
+        ds,
+        "prediction",
+        "he_id",
+        model_name,
+        model_engine,
+        lcb_model,
+        y_pred_test,
+        y_pred_sales,
+        y_pred_univ,
+        timing,
+        verbose=verbose,
+    )
+
+    return results
+
+
 def predict_garbage(
     ds: DataSplit,
     garbage_model: GarbageModel,
@@ -6358,6 +6499,9 @@ def write_model_parameters(
         write_shaps(model, outpath, smr, location, do_plot, verbose=verbose)
     elif isinstance(model, LocalAreaModel):
         write_local_area_params(model, smr, outpath, do_plot)
+    elif isinstance(model, LayeredCompBaggingModel):
+        # TODO: no parameter export is implemented for this model type yet
+        pass
     # ...and so on
     else:
         raise TypeError(f"Unexpected model type: {type(model).__name__}")
@@ -6371,4 +6515,16 @@ def _sanitize_categoricals(X: pd.DataFrame) -> pd.DataFrame:
             X[c] = X[c].cat.rename_categories(cats.astype("object"))
     return X
 
+
+def _coerce_categoricals_to_object(X: pd.DataFrame) -> pd.DataFrame:
+    """Convert pandas Categorical columns to object dtype.
+
+    This avoids setitem/fillna category errors in libraries that write string
+    missing tokens into categorical series.
+    """
+    X = X.copy()
+    for c in X.select_dtypes(["category"]).columns:
+        X[c] = X[c].astype("object")
+    return X
+
 ##############################
diff --git a/openavmkit/utilities/modeling.py b/openavmkit/utilities/modeling.py
index ebcd8bdc..d92f527d 100644
--- a/openavmkit/utilities/modeling.py
+++ b/openavmkit/utilities/modeling.py
@@ -3,7 +3,7 @@
 from statsmodels.regression.linear_model import RegressionResults
 from pygam import LinearGAM, s, te
 import pandas as pd
-from typing import Any
+from typing import Any, Dict
 from dataclasses import dataclass
 from itertools import combinations
 
@@ -557,6 +557,28 @@ def __init__(self, regressor, cat_data):
         self.cat_data = cat_data
 
 
+class LayeredCompBaggingModel:
+    """Layered Comp Bagging Model
+
+    A bagging ensemble version of the LayeredCompModel algorithm that reduces variance
+    and automatically optimizes the weight_falloff for each tree in the ensemble.
+
+    Attributes
+    ----------
+    model: layeredcompmodel.LayeredCompBaggingModel
+        The trained LayeredCompBaggingModel from the layeredcompmodel package
+    """
+    def __init__(self, model):
+        """Initialize a LayeredCompBaggingModel
+
+        Parameters
+        ----------
+        model : layeredcompmodel.LayeredCompBaggingModel
+            The trained LayeredCompBaggingModel instance
+        """
+        self.model = model
+
+
 class MRAModel:
     """Multiple Regression Analysis Model
 
diff --git a/requirements.txt b/requirements.txt
index 29385079..d0d7c1f5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,7 @@ seaborn==0.13.2
 statsmodels==0.14.6
 xgboost==3.2.0
 xlsxwriter==3.2.9
+layeredcompmodel==0.2.1
 census==0.8.25
 shap==0.50.0
 mpld3==0.5.12