Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions openavmkit/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
run_lightgbm,
run_catboost,
run_slice,
run_layeredcompbagging,
run_garbage,
run_average,
run_naive_area,
Expand All @@ -57,6 +58,7 @@
predict_catboost,
predict_lightgbm,
predict_slice,
predict_layeredcompbagging,
predict_ground_truth,
predict_spatial_lag,
GarbageModel,
Expand Down Expand Up @@ -1154,6 +1156,8 @@ def get_data_split_for(
df_sales = _clean_categoricals(df_sales, fields_cat, settings)
df_universe = _clean_categoricals(df_universe, fields_cat, settings)
_ind_vars = ind_vars
elif model_engine == "layeredcompbagging":
_ind_vars = ind_vars
else:
_ind_vars = ind_vars
if model_engine == "gwr" or model_engine == "kernel":
Expand Down Expand Up @@ -1421,6 +1425,10 @@ def run_one_model(
results = run_catboost(
ds, outpath, save_params, use_saved_params, n_trials=n_trials, verbose=verbose, use_gpu=use_gpu
)
elif model_engine == "layeredcompbagging":
results = run_layeredcompbagging(
ds, outpath, save_params, use_saved_params, n_trials=n_trials, verbose=verbose
)
elif model_engine == "slice":
results = run_slice(ds, verbose=verbose)
else:
Expand Down
159 changes: 157 additions & 2 deletions openavmkit/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import xgboost as xgb
import lightgbm as lgb
import catboost
from layeredcompmodel import LayeredCompBaggingModel as LCBModel
from catboost import CatBoostRegressor, Pool
from lightgbm import Booster
from matplotlib import pyplot as plt
Expand Down Expand Up @@ -77,6 +78,7 @@
XGBoostModel,
LightGBMModel,
CatBoostModel,
LayeredCompBaggingModel,
MultiMRAModel,
GroundTruthModel,
SpatialLagModel,
Expand Down Expand Up @@ -3038,8 +3040,10 @@ def predict_gwr(
model_name = ds.name

# Organize the parameters

## Generate column names, accounting for the intercept
missing = [c for c in ds.ind_vars if c not in ds.X_train.columns]
assert len(missing) == 0, f"Missing variables from dataframe: {missing}"
cols = (["intercept"] + list(ds.ind_vars)) if intercept else list(ds.ind_vars)

## Get the key/key sale values to accompany each row
Expand All @@ -3048,7 +3052,7 @@ def predict_gwr(
sales_list_key_sale = ds.df_sales["key_sale"].values.tolist()
sales_list_key = ds.df_sales["key"].values.tolist()
univ_list_key = ds.df_universe["key"].values.tolist()

## Generate dataframes for each set of parameters, and add the keys
df_params_test = pd.DataFrame(params_test, columns=cols)
df_params_test.insert(0, "key_sale", test_list_key_sale)
Expand Down Expand Up @@ -3778,6 +3782,142 @@ def run_slice(
return predict_slice(ds, slice_model, timing, verbose)


def run_layeredcompbagging(
    ds: DataSplit,
    outpath: str,
    save_params: bool = False,
    use_saved_params: bool = False,
    n_trials: int = 50,
    verbose: bool = False,
) -> SingleModelResults:
    """
    Train a LayeredCompBagging model on the training split and score all splits.

    Parameters
    ----------
    ds : DataSplit
        DataSplit object.
    outpath : str
        Output path for saving parameters (not used by this engine yet).
    save_params : bool, optional
        Accepted for API parity with the other ``run_*`` functions; this
        engine does not persist anything yet. Defaults to False.
    use_saved_params : bool, optional
        Accepted for API parity; no saved parameters are loaded.
        Defaults to False.
    n_trials : int, optional
        Not used for LayeredCompBagging. Kept for API consistency. Defaults to 50.
    verbose : bool, optional
        If True, print verbose output. Defaults to False.

    Returns
    -------
    SingleModelResults
        Prediction results from the LayeredCompBagging model.
    """

    timing = TimingData()
    timing.start("total")

    timing.start("setup")
    ds.split()

    # layeredcompmodel fills missing values with the literal string "NaN",
    # which raises on pandas Categorical columns unless "NaN" is already a
    # declared category. Coercing every split to plain object dtype up
    # front sidesteps that restriction for both fit and predict.
    for split_attr in ("X_train", "X_test", "X_sales", "X_univ"):
        setattr(ds, split_attr, _coerce_categoricals_to_object(getattr(ds, split_attr)))
    timing.stop("setup")

    # No hyperparameter search is performed for this engine; the empty
    # start/stop pair keeps the timing report shape consistent with the
    # other model engines.
    timing.start("parameter_search")
    timing.stop("parameter_search")

    timing.start("train")
    inner_model = LCBModel(tree_count=10, sample_pct=0.95, random_state=42, n_jobs=4)
    inner_model.fit(ds.X_train, ds.y_train)
    # Wrap the fitted ensemble in openavmkit's adapter class so downstream
    # result plumbing can dispatch on its type.
    wrapped = LayeredCompBaggingModel(inner_model)
    timing.stop("train")

    return predict_layeredcompbagging(ds, wrapped, timing, verbose)


def predict_layeredcompbagging(
    ds: DataSplit,
    lcb_model: LayeredCompBaggingModel,
    timing: TimingData,
    verbose: bool = False,
) -> SingleModelResults:
    """
    Generate predictions using a LayeredCompBagging model.

    Parameters
    ----------
    ds : DataSplit
        DataSplit object containing train/test/universe splits.
    lcb_model : LayeredCompBaggingModel
        Trained LayeredCompBaggingModel instance.
    timing : TimingData
        TimingData object for recording performance metrics.
    verbose : bool, optional
        If True, print verbose output. Defaults to False.

    Returns
    -------
    SingleModelResults
        Prediction results from the LayeredCompBagging model.
    """

    regressor = lcb_model.model

    def _score(timer_label, features, gate):
        # Time one prediction pass; when the split is empty, return an
        # empty array without ever invoking the underlying regressor.
        timing.start(timer_label)
        if len(gate) == 0:
            predictions = np.array([])
        else:
            predictions = regressor.predict(features)
        timing.stop(timer_label)
        return predictions

    y_pred_test = _score("predict_test", ds.X_test, ds.y_test)
    y_pred_sales = _score("predict_sales", ds.X_sales, ds.y_sales)
    y_pred_univ = _score("predict_univ", ds.X_univ, ds.X_univ)

    # "total" was started by the corresponding run_* entry point.
    timing.stop("total")

    return SingleModelResults(
        ds,
        "prediction",
        "he_id",
        ds.name,
        "layeredcompbagging",
        lcb_model,
        y_pred_test,
        y_pred_sales,
        y_pred_univ,
        timing,
        verbose=verbose,
    )


def predict_garbage(
ds: DataSplit,
garbage_model: GarbageModel,
Expand Down Expand Up @@ -6358,6 +6498,9 @@ def write_model_parameters(
write_shaps(model, outpath, smr, location, do_plot, verbose=verbose)
elif isinstance(model, LocalAreaModel):
write_local_area_params(model, smr, outpath, do_plot)
elif isinstance(model, LayeredCompBaggingModel):
# TODO
pass
# ...and so on
else:
raise TypeError(f"Unexpected model type: {type(model).__name__}")
Expand All @@ -6371,4 +6514,16 @@ def _sanitize_categoricals(X: pd.DataFrame) -> pd.DataFrame:
X[c] = X[c].cat.rename_categories(cats.astype("object"))
return X


def _coerce_categoricals_to_object(X: pd.DataFrame) -> pd.DataFrame:
"""Convert pandas Categorical columns to object dtype.

This avoids setitem/fillna category errors in libraries that write string
missing tokens into categorical series.
"""
X = X.copy()
for c in X.select_dtypes(["category"]).columns:
X[c] = X[c].astype("object")
return X

##############################
24 changes: 23 additions & 1 deletion openavmkit/utilities/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from statsmodels.regression.linear_model import RegressionResults
from pygam import LinearGAM, s, te
import pandas as pd
from typing import Any
from typing import Any, Dict

from dataclasses import dataclass
from itertools import combinations
Expand Down Expand Up @@ -557,6 +557,28 @@ def __init__(self, regressor, cat_data):
self.cat_data = cat_data


class LayeredCompBaggingModel:
    """Adapter around a trained layeredcompmodel bagging ensemble.

    Holds a fitted ``layeredcompmodel.LayeredCompBaggingModel`` — a bagging
    ensemble version of the LayeredCompModel algorithm that reduces variance
    and automatically optimizes the weight_falloff for each tree — so it can
    flow through openavmkit's generic model-result machinery.

    Attributes
    ----------
    model: layeredcompmodel.LayeredCompBaggingModel
        The trained LayeredCompBaggingModel from the layeredcompmodel package
    """

    def __init__(self, model):
        """Store the fitted ensemble on the wrapper.

        Parameters
        ----------
        model : layeredcompmodel.LayeredCompBaggingModel
            The trained LayeredCompBaggingModel instance
        """
        self.model = model


class MRAModel:
"""Multiple Regression Analysis Model

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ seaborn==0.13.2
statsmodels==0.14.6
xgboost==3.2.0
xlsxwriter==3.2.9
layeredcompmodel==0.2.1
census==0.8.25
shap==0.50.0
mpld3==0.5.12
Expand Down
Loading