From 738faee0d4cdcb18b7bcd8b5603e64b2f254ecbc Mon Sep 17 00:00:00 2001 From: John Kossa Date: Tue, 5 May 2026 12:24:12 -0500 Subject: [PATCH 1/3] Added layered comp bagging model. Fixed a minor GWR issue --- .idea/dictionaries/project.xml | 7 ++ openavmkit/benchmark.py | 8 ++ openavmkit/modeling.py | 159 ++++++++++++++++++++++++++++++- openavmkit/utilities/modeling.py | 24 ++++- requirements.txt | 1 + test_integration.py | 39 ++++++++ 6 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 .idea/dictionaries/project.xml create mode 100644 test_integration.py diff --git a/.idea/dictionaries/project.xml b/.idea/dictionaries/project.xml new file mode 100644 index 00000000..e05bdbb8 --- /dev/null +++ b/.idea/dictionaries/project.xml @@ -0,0 +1,7 @@ + + + + categoricals + + + \ No newline at end of file diff --git a/openavmkit/benchmark.py b/openavmkit/benchmark.py index 69378ddb..5419ccef 100644 --- a/openavmkit/benchmark.py +++ b/openavmkit/benchmark.py @@ -36,6 +36,7 @@ run_lightgbm, run_catboost, run_slice, + run_layeredcompbagging, run_garbage, run_average, run_naive_area, @@ -57,6 +58,7 @@ predict_catboost, predict_lightgbm, predict_slice, + predict_layeredcompbagging, predict_ground_truth, predict_spatial_lag, GarbageModel, @@ -1154,6 +1156,8 @@ def get_data_split_for( df_sales = _clean_categoricals(df_sales, fields_cat, settings) df_universe = _clean_categoricals(df_universe, fields_cat, settings) _ind_vars = ind_vars + elif model_engine == "layeredcompbagging": + _ind_vars = ind_vars else: _ind_vars = ind_vars if model_engine == "gwr" or model_engine == "kernel": @@ -1421,6 +1425,10 @@ def run_one_model( results = run_catboost( ds, outpath, save_params, use_saved_params, n_trials=n_trials, verbose=verbose, use_gpu=use_gpu ) + elif model_engine == "layeredcompbagging": + results = run_layeredcompbagging( + ds, outpath, save_params, use_saved_params, n_trials=n_trials, verbose=verbose + ) elif model_engine == "slice": results = run_slice(ds, verbose=verbose) else: diff --git a/openavmkit/modeling.py b/openavmkit/modeling.py index 55d86412..60420d10 100644 --- a/openavmkit/modeling.py +++ b/openavmkit/modeling.py @@ -25,6 +25,7 @@ import xgboost as xgb import lightgbm as lgb import catboost +from layeredcompmodel import LayeredCompBaggingModel as LCBModel from catboost import CatBoostRegressor, Pool from lightgbm import Booster from matplotlib import pyplot as plt @@ -77,6 +78,7 @@ XGBoostModel, LightGBMModel, CatBoostModel, + LayeredCompBaggingModel, MultiMRAModel, GroundTruthModel, SpatialLagModel, @@ -3038,8 +3040,10 @@ def predict_gwr( model_name = ds.name # Organize the parameters - + ## Generate column names, accounting for the intercept + missing = [c for c in ds.ind_vars if c not in ds.X_train.columns] + assert len(missing) == 0, f"Missing variables from dataframe: {missing}" cols = (["intercept"] + list(ds.ind_vars)) if intercept else list(ds.ind_vars) ## Get the key/key sale values to accompany each row @@ -3048,7 +3052,7 @@ def predict_gwr( sales_list_key_sale = ds.df_sales["key_sale"].values.tolist() sales_list_key = ds.df_sales["key"].values.tolist() univ_list_key = ds.df_universe["key"].values.tolist() - + ## Generate dataframes for each set of parameters, and add the keys df_params_test = pd.DataFrame(params_test, columns=cols) df_params_test.insert(0, "key_sale", test_list_key_sale) @@ -3778,6 +3782,142 @@ def run_slice( return predict_slice(ds, slice_model, timing, verbose) +def run_layeredcompbagging( + ds: DataSplit, + outpath: str, + save_params: bool = False, + use_saved_params: bool = False, + n_trials: int = 50, + verbose: bool = False, +) -> SingleModelResults: + """ + Run a LayeredCompBagging model by training and predicting. + + Parameters + ---------- + ds : DataSplit + DataSplit object. + outpath : str + Output path for saving parameters. + save_params : bool, optional + Whether to save trained model. Defaults to False. + use_saved_params : bool, optional + Whether to load saved model. Defaults to False. + n_trials : int, optional + Not used for LayeredCompBagging. Kept for API consistency. Defaults to 50. + verbose : bool, optional + If True, print verbose output. Defaults to False. + + Returns + ------- + SingleModelResults + Prediction results from the LayeredCompBagging model. + """ + + timing = TimingData() + + timing.start("total") + + timing.start("setup") + ds.split() + + # layeredcompmodel internally calls fillna("NaN"); this fails on pandas + # Categorical unless "NaN" is a declared category. Coerce categories + # to plain object dtype for all splits before fit/predict. + ds.X_train = _coerce_categoricals_to_object(ds.X_train) + ds.X_test = _coerce_categoricals_to_object(ds.X_test) + ds.X_sales = _coerce_categoricals_to_object(ds.X_sales) + ds.X_univ = _coerce_categoricals_to_object(ds.X_univ) + timing.stop("setup") + + timing.start("parameter_search") + timing.stop("parameter_search") + + timing.start("train") + + # Train the LayeredCompBagging model + lcb_model = LCBModel(tree_count=10, sample_pct=0.95, random_state=42, n_jobs=4) + lcb_model.fit(ds.X_train, ds.y_train) + + # Wrap it in our wrapper class + wrapped_model = LayeredCompBaggingModel(lcb_model) + + timing.stop("train") + + return predict_layeredcompbagging(ds, wrapped_model, timing, verbose) + + +def predict_layeredcompbagging( + ds: DataSplit, + lcb_model: LayeredCompBaggingModel, + timing: TimingData, + verbose: bool = False, +) -> SingleModelResults: + """ + Generate predictions using a LayeredCompBagging model. + + Parameters + ---------- + ds : DataSplit + DataSplit object containing train/test/universe splits. + lcb_model : LayeredCompBaggingModel + Trained LayeredCompBaggingModel instance. + timing : TimingData + TimingData object for recording performance metrics. + verbose : bool, optional + If True, print verbose output. Defaults to False. + + Returns + ------- + SingleModelResults + Prediction results from the LayeredCompBagging model. + """ + + regressor = lcb_model.model + + timing.start("predict_test") + if len(ds.y_test) == 0: + y_pred_test = np.array([]) + else: + y_pred_test = regressor.predict(ds.X_test) + timing.stop("predict_test") + + timing.start("predict_sales") + if len(ds.y_sales) == 0: + y_pred_sales = np.array([]) + else: + y_pred_sales = regressor.predict(ds.X_sales) + timing.stop("predict_sales") + + timing.start("predict_univ") + if len(ds.X_univ) == 0: + y_pred_univ = np.array([]) + else: + y_pred_univ = regressor.predict(ds.X_univ) + timing.stop("predict_univ") + + timing.stop("total") + + model_name = ds.name + model_engine = "layeredcompbagging" + + results = SingleModelResults( + ds, + "prediction", + "he_id", + model_name, + model_engine, + lcb_model, + y_pred_test, + y_pred_sales, + y_pred_univ, + timing, + verbose=verbose, + ) + + return results + + def predict_garbage( ds: DataSplit, garbage_model: GarbageModel, @@ -6358,6 +6498,9 @@ def write_model_parameters( write_shaps(model, outpath, smr, location, do_plot, verbose=verbose) elif isinstance(model, LocalAreaModel): write_local_area_params(model, smr, outpath, do_plot) + elif isinstance(model, LayeredCompBaggingModel): + # TODO + pass # ...and so on else: raise TypeError(f"Unexpected model type: {type(model).__name__}") @@ -6371,4 +6514,16 @@ def _sanitize_categoricals(X: pd.DataFrame) -> pd.DataFrame: X[c] = X[c].cat.rename_categories(cats.astype("object")) return X + +def _coerce_categoricals_to_object(X: pd.DataFrame) -> pd.DataFrame: + """Convert pandas Categorical columns to object dtype. + + This avoids setitem/fillna category errors in libraries that write string + missing tokens into categorical series. + """ + X = X.copy() + for c in X.select_dtypes(["category"]).columns: + X[c] = X[c].astype("object") + return X + ############################## diff --git a/openavmkit/utilities/modeling.py b/openavmkit/utilities/modeling.py index ebcd8bdc..d92f527d 100644 --- a/openavmkit/utilities/modeling.py +++ b/openavmkit/utilities/modeling.py @@ -3,7 +3,7 @@ from statsmodels.regression.linear_model import RegressionResults from pygam import LinearGAM, s, te import pandas as pd -from typing import Any +from typing import Any, Dict from dataclasses import dataclass from itertools import combinations @@ -557,6 +557,28 @@ def __init__(self, regressor, cat_data): self.cat_data = cat_data +class LayeredCompBaggingModel: + """Layered Comp Bagging Model + + A bagging ensemble version of the LayeredCompModel algorithm that reduces variance + and automatically optimizes the weight_falloff for each tree in the ensemble. + + Attributes + ---------- + model: layeredcompmodel.LayeredCompBaggingModel + The trained LayeredCompBaggingModel from the layeredcompmodel package + """ + def __init__(self, model): + """Initialize a LayeredCompBaggingModel + + Parameters + ---------- + model : layeredcompmodel.LayeredCompBaggingModel + The trained LayeredCompBaggingModel instance + """ + self.model = model + + class MRAModel: """Multiple Regression Analysis Model diff --git a/requirements.txt b/requirements.txt index 29385079..d0d7c1f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ seaborn==0.13.2 statsmodels==0.14.6 xgboost==3.2.0 xlsxwriter==3.2.9 +layeredcompmodel==0.2.1 census==0.8.25 shap==0.50.0 mpld3==0.5.12 diff --git a/test_integration.py b/test_integration.py new file mode 100644 index 00000000..173c4403 --- /dev/null +++ b/test_integration.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +""" +Integration test for LayeredCompBaggingModel +""" + +print("Testing LayeredCompBaggingModel integration...") + +# Test 1: Import from utilities.modeling +from openavmkit.utilities.modeling import ( + GarbageModel, AverageModel, NaiveAreaModel, LocalAreaModel, + PassThroughModel, GWRModel, MRAModel, XGBoostModel, + LightGBMModel, CatBoostModel, LayeredCompBaggingModel, + MultiMRAModel, GroundTruthModel, SpatialLagModel, + LandSLICEModel, greedy_forward_loocv, TreeBasedCategoricalData +) +print("✓ All model classes imported successfully from utilities.modeling") + +# Test 2: Import from main modeling module +from openavmkit.modeling import LayeredCompBaggingModel +print("✓ LayeredCompBaggingModel imported successfully from main modeling") + +# Test 3: Verify layeredcompmodel package is available +import layeredcompmodel +print("✓ layeredcompmodel package available") + +# Test 4: Quick instantiation test +from layeredcompmodel import LayeredCompBaggingModel as LCBM +lcb = LCBM(tree_count=5) +wrapped = LayeredCompBaggingModel(lcb) +print("✓ LayeredCompBaggingModel wrapper successfully instantiated") + +# Test 5: Verify the wrapped model has the expected structure +assert hasattr(wrapped, 'model'), "Wrapper should have 'model' attribute" +assert isinstance(wrapped.model, LCBM), "Wrapped model should be a LayeredCompBaggingModel instance" +print("✓ Wrapper structure verified") + +print("\n✅ Integration complete and verified!") +print("\nLayeredCompBaggingModel is now available in the openavmkit toolkit!") + From c5f168a28b6f9e252d8f65c623c85b8f2fc46b6a Mon Sep 17 00:00:00 2001 From: John Kossa Date: Tue, 5 May 2026 12:25:49 -0500 Subject: [PATCH 2/3] Delete test_integration.py --- test_integration.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 test_integration.py diff --git a/test_integration.py b/test_integration.py deleted file mode 100644 index 173c4403..00000000 --- a/test_integration.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python -""" -Integration test for LayeredCompBaggingModel -""" - -print("Testing LayeredCompBaggingModel integration...") - -# Test 1: Import from utilities.modeling -from openavmkit.utilities.modeling import ( - GarbageModel, AverageModel, NaiveAreaModel, LocalAreaModel, - PassThroughModel, GWRModel, MRAModel, XGBoostModel, - LightGBMModel, CatBoostModel, LayeredCompBaggingModel, - MultiMRAModel, GroundTruthModel, SpatialLagModel, - LandSLICEModel, greedy_forward_loocv, TreeBasedCategoricalData -) -print("✓ All model classes imported successfully from utilities.modeling") - -# Test 2: Import from main modeling module -from openavmkit.modeling import LayeredCompBaggingModel -print("✓ LayeredCompBaggingModel imported successfully from main modeling") - -# Test 3: Verify layeredcompmodel package is available -import layeredcompmodel -print("✓ layeredcompmodel package available") - -# Test 4: Quick instantiation test -from layeredcompmodel import LayeredCompBaggingModel as LCBM -lcb = LCBM(tree_count=5) -wrapped = LayeredCompBaggingModel(lcb) -print("✓ LayeredCompBaggingModel wrapper successfully instantiated") - -# Test 5: Verify the wrapped model has the expected structure -assert hasattr(wrapped, 'model'), "Wrapper should have 'model' attribute" -assert isinstance(wrapped.model, LCBM), "Wrapped model should be a LayeredCompBaggingModel instance" -print("✓ Wrapper structure verified") - -print("\n✅ Integration complete and verified!") -print("\nLayeredCompBaggingModel is now available in the openavmkit toolkit!") - From d70926c023b7628415eb0712644eaf96d99a7c07 Mon Sep 17 00:00:00 2001 From: John Kossa Date: Tue, 5 May 2026 12:26:58 -0500 Subject: [PATCH 3/3] Delete .idea/dictionaries/project.xml --- .idea/dictionaries/project.xml | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 .idea/dictionaries/project.xml diff --git a/.idea/dictionaries/project.xml b/.idea/dictionaries/project.xml deleted file mode 100644 index e05bdbb8..00000000 --- a/.idea/dictionaries/project.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - categoricals - - - \ No newline at end of file