From 92dc4d9b76e4ea0bba8a831268e6d5fd7ff5ae4a Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 01:55:28 +0000 Subject: [PATCH 1/5] Vectorize building-parcel intersection area calculation in OvertureService Replace slow row-by-row apply() with vectorized geopandas operations: build a GeoSeries of building geometries aligned with the joined dataframe, then call .intersection() and .area in one pass instead of per-row. https://claude.ai/code/session_01Dmffw6eWUC7rrE6JD1SXXX --- openavmkit/utilities/overture.py | 33 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/openavmkit/utilities/overture.py b/openavmkit/utilities/overture.py index 53e7ff83..82874dbb 100644 --- a/openavmkit/utilities/overture.py +++ b/openavmkit/utilities/overture.py @@ -395,26 +395,21 @@ def calculate_building_footprints( if verbose: print(f"--> Found {len(joined)} potential building-parcel intersections") - def calculate_intersection_area(row): - try: - parcel_geom = gdf_projected.loc[row.name, "geometry"] - building_idx = row["index_right"] - if pd.isna(building_idx): - return 0.0 - building_geom = buildings_projected.loc[building_idx, "geometry"] - if parcel_geom.intersects(building_geom): - intersection = parcel_geom.intersection(building_geom) - return intersection.area * unit_mult # Convert to desired units - return 0.0 - except Exception as e: - if verbose: - print(f"Warning: Error calculating intersection area: {e}") - return 0.0 - t.start("calc_area") - # TODO: Optimize this step using vectorized operations if possible - # Calculate intersection areas - joined[field_name] = joined.apply(calculate_intersection_area, axis=1) + # Vectorized intersection area calculation + joined[field_name] = 0.0 + valid_mask = ~joined["index_right"].isna() + if valid_mask.any(): + valid_joined = joined[valid_mask] + building_geoms = buildings_projected.loc[ + valid_joined["index_right"].astype(int), "geometry" + ].values + building_gs = gpd.GeoSeries( + building_geoms, index=valid_joined.index, crs=buildings_projected.crs + ) + joined.loc[valid_mask, field_name] = ( + valid_joined["geometry"].intersection(building_gs).area * unit_mult + ) t.stop("calc_area") if verbose: From f0224c37f04f2d9ae64609db438d15bf5a9c3e08 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 03:34:44 +0000 Subject: [PATCH 2/5] Implement LYCD (Least You Can Do) land valuation method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds calc_lycd_land_values() to land.py, which values land by: 1. Computing a uniform local land rate per model group from the median market value and median lot size of improved properties 2. Applying that rate to every parcel (land_value = rate × lot_size) This avoids the side-by-side disparity created by applying a flat allocation % to each parcel's individual market value. Supports a user-supplied land_alloc (float or per-group dict) or automatic derivation from vacant vs improved per-unit value ratios. Also fixes missing area_unit import in land.py. https://claude.ai/code/session_011PMWoWpQazUGuCXKT1nmVg --- openavmkit/land.py | 162 +++++++++++++++++++++++++++++++++++++++++- tests/test_land.py | 171 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 331 insertions(+), 2 deletions(-) create mode 100644 tests/test_land.py diff --git a/openavmkit/land.py b/openavmkit/land.py index 61a53656..f15a1c32 100644 --- a/openavmkit/land.py +++ b/openavmkit/land.py @@ -22,7 +22,7 @@ add_area_fields, ) from openavmkit.utilities.plotting import plot_histogram_df -from openavmkit.utilities.settings import get_model_group_ids +from openavmkit.utilities.settings import area_unit, get_model_group_ids from openavmkit.utilities.stats import calc_correlations, calc_cod, calc_r2, calc_mse_r2_adj_r2 @@ -787,4 +787,162 @@ def _convolve_land_analysis( print(f"TEST LAND RESULTS, MODEL GROUP : {model_group}, count: {count}") print("=" * 80) print(df_results_test.to_string()) - print("") \ No newline at end of file + print("") + + +def calc_lycd_land_values( + df: pd.DataFrame, + settings: dict, + land_alloc: "float | dict | None" = None, + market_value_field: str = "model_market_value", +) -> pd.DataFrame: + """Compute land values using the "Least You Can Do" (LYCD) method. + + For each model group this method: + + 1. Takes the median market value and median lot size of **improved** + (non-vacant) properties in the group. + 2. Derives a uniform local land rate:: + + local_land_rate = (median_market_value * land_alloc_pct) / median_lot_size + + 3. Applies that rate to every parcel:: + + land_value = local_land_rate * parcel_lot_size + + Because every parcel's land value is driven by the *typical* parcel in its + area rather than its own improvement value, the method avoids the absurd + side-by-side disparities that arise from naively multiplying each parcel's + market value by a fixed allocation fraction. + + When ``land_alloc`` is ``None`` the allocation for each group is derived + automatically from the data. For each group the median per-unit value of + **vacant** properties is divided by the median per-unit value of **improved** + properties; that ratio is the implied land allocation. If a group has no + vacant properties the global ratio (across all groups combined) is used as a + fallback. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing at minimum: + + - ``"key"`` + - ``"model_group"`` + - ``market_value_field`` (default ``"model_market_value"``) + - ``"is_vacant"`` (bool) + - ``f"land_area_{unit}"`` where *unit* is ``"sqft"`` or ``"sqm"`` + + settings : dict + Settings dictionary (used to determine the area unit). + land_alloc : float, dict, or None + Fraction of market value attributable to land. + + - **float** – applied uniformly to all model groups (e.g. ``0.20`` for 20 %). + - **dict** – ``{model_group_id: float}`` for per-group allocations. + - **None** – derived automatically from the ratio of vacant-to-improved + per-unit market values within each group. + + market_value_field : str + Column name holding the market value estimate. Defaults to + ``"model_market_value"``. + + Returns + ------- + pd.DataFrame + Copy of *df* with three additional columns: + + - ``"lycd_land_alloc"`` – land allocation fraction used for the group. + - ``"lycd_local_land_rate"`` – implied per-area-unit land rate for the group. + - ``"lycd_land_value"`` – resulting land value (clamped to ``[0, market_value]``). + """ + unit = area_unit(settings) + lot_area_field = f"land_area_{unit}" + + df = df.copy() + model_groups = df["model_group"].unique() + + # Resolve land_alloc to a per-group dict + if isinstance(land_alloc, dict): + alloc_by_group = {mg: land_alloc.get(mg, np.nan) for mg in model_groups} + elif land_alloc is not None: + alloc_by_group = {mg: float(land_alloc) for mg in model_groups} + else: + alloc_by_group = _derive_lycd_alloc_from_data(df, lot_area_field, market_value_field) + + local_land_rates = {} + resolved_allocs = {} + + for mg in model_groups: + mask_improved = df["model_group"].eq(mg) & df["is_vacant"].eq(False) + df_improved = df[mask_improved] + + alloc = alloc_by_group.get(mg, np.nan) + resolved_allocs[mg] = alloc + + if len(df_improved) == 0 or np.isnan(alloc): + local_land_rates[mg] = np.nan + continue + + median_mv = df_improved[market_value_field].median() + median_lot = df_improved[lot_area_field].median() + + if median_lot <= 0 or np.isnan(median_lot) or np.isnan(median_mv): + local_land_rates[mg] = np.nan + continue + + local_land_rates[mg] = (median_mv * alloc) / median_lot + + df["lycd_land_alloc"] = df["model_group"].map(resolved_allocs) + df["lycd_local_land_rate"] = df["model_group"].map(local_land_rates) + df["lycd_land_value"] = df["lycd_local_land_rate"] * df[lot_area_field] + + # Clamp: must be >= 0 and <= market value + df["lycd_land_value"] = df["lycd_land_value"].clip(lower=0) + exceeds_mv = df["lycd_land_value"].gt(df[market_value_field]) + df.loc[exceeds_mv, "lycd_land_value"] = df.loc[exceeds_mv, market_value_field] + + return df + + +def _derive_lycd_alloc_from_data( + df: pd.DataFrame, + lot_area_field: str, + market_value_field: str, +) -> dict: + """Derive per-group land allocation fractions from vacant vs improved values. + + For each model group the allocation is: + + median(market_value / lot_area) for vacant parcels + ───────────────────────────────────────────────── + median(market_value / lot_area) for improved parcels + + If a group has no vacant parcels the global ratio is used as a fallback. + """ + def _per_unit_median(mask): + sub = df[mask].copy() + sub = sub[sub[lot_area_field].gt(0)] + rates = sub[market_value_field] / sub[lot_area_field] + return rates.median() + + # Global fallback + global_vacant_rate = _per_unit_median(df["is_vacant"].eq(True)) + global_improved_rate = _per_unit_median(df["is_vacant"].eq(False)) + if global_improved_rate > 0 and not np.isnan(global_vacant_rate): + global_alloc = global_vacant_rate / global_improved_rate + else: + global_alloc = np.nan + + result = {} + for mg in df["model_group"].unique(): + mask_mg = df["model_group"].eq(mg) + vacant_rate = _per_unit_median(mask_mg & df["is_vacant"].eq(True)) + improved_rate = _per_unit_median(mask_mg & df["is_vacant"].eq(False)) + + if improved_rate > 0 and not np.isnan(vacant_rate): + result[mg] = vacant_rate / improved_rate + else: + result[mg] = global_alloc + + return result \ No newline at end of file diff --git a/tests/test_land.py b/tests/test_land.py new file mode 100644 index 00000000..2487da96 --- /dev/null +++ b/tests/test_land.py @@ -0,0 +1,171 @@ +import numpy as np +import pandas as pd +import pytest + +from openavmkit.land import calc_lycd_land_values + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_settings(units="imperial"): + return {"locality": {"units": units}} + + +def _make_df(): + """Return a tiny universe with two model groups and a mix of + vacant / improved properties.""" + return pd.DataFrame({ + "key": ["A1", "A2", "A3", "A4", "B1", "B2", "B3"], + "model_group": ["G1", "G1", "G1", "G1", "G2", "G2", "G2"], + "model_market_value": [200_000, 250_000, 300_000, 50_000, + 400_000, 500_000, 80_000], + "land_area_sqft": [5_000, 6_000, 7_000, 4_000, + 10_000, 12_000, 8_000], + # A4 and B3 are vacant lots; the rest are improved + "is_vacant": [False, False, False, True, + False, False, True], + }) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +def test_lycd_scalar_alloc(): + """With a fixed 20 % allocation every group gets the same land rate formula.""" + df = _make_df() + settings = _make_settings() + result = calc_lycd_land_values(df, settings, land_alloc=0.20) + + assert "lycd_land_value" in result.columns + assert "lycd_local_land_rate" in result.columns + assert "lycd_land_alloc" in result.columns + + # --- Group G1 --- + # improved: A1(200k/5k), A2(250k/6k), A3(300k/7k) + # median_mv_improved = 250_000 + # median_lot_improved = 6_000 + # local_land_rate = (250_000 * 0.20) / 6_000 ≈ 8.3333 + g1_rate = (250_000 * 0.20) / 6_000 + for _, row in result[result["model_group"] == "G1"].iterrows(): + assert abs(row["lycd_local_land_rate"] - g1_rate) < 1e-6 + expected_lv = min(g1_rate * row["land_area_sqft"], row["model_market_value"]) + assert abs(row["lycd_land_value"] - expected_lv) < 1e-2 + assert abs(row["lycd_land_alloc"] - 0.20) < 1e-10 + + # --- Group G2 --- + # improved: B1(400k/10k), B2(500k/12k) + # median_mv_improved = 450_000 + # median_lot_improved = 11_000 + # local_land_rate = (450_000 * 0.20) / 11_000 + g2_rate = (450_000 * 0.20) / 11_000 + for _, row in result[result["model_group"] == "G2"].iterrows(): + assert abs(row["lycd_local_land_rate"] - g2_rate) < 1e-6 + + +def test_lycd_dict_alloc(): + """Per-group allocations are applied correctly.""" + df = _make_df() + settings = _make_settings() + alloc = {"G1": 0.15, "G2": 0.25} + result = calc_lycd_land_values(df, settings, land_alloc=alloc) + + g1_rows = result[result["model_group"] == "G1"] + g2_rows = result[result["model_group"] == "G2"] + + assert (g1_rows["lycd_land_alloc"] - 0.15).abs().max() < 1e-10 + assert (g2_rows["lycd_land_alloc"] - 0.25).abs().max() < 1e-10 + + +def test_lycd_auto_alloc_from_vacant(): + """When land_alloc=None the allocation is derived from vacant vs improved + per-unit values within each group.""" + df = _make_df() + settings = _make_settings() + result = calc_lycd_land_values(df, settings, land_alloc=None) + + # --- Group G1 --- + # vacant: A4 → 50_000 / 4_000 = 12.5 $/sqft (only one, median = 12.5) + # improved per-unit medians: + # A1: 200_000/5_000 = 40.0 + # A2: 250_000/6_000 ≈ 41.667 + # A3: 300_000/7_000 ≈ 42.857 + # median ≈ 41.667 + # implied alloc = 12.5 / 41.667 ≈ 0.3000 + g1_vacant_rate = 50_000 / 4_000 + g1_improved_rates = sorted([200_000 / 5_000, 250_000 / 6_000, 300_000 / 7_000]) + g1_improved_median = np.median(g1_improved_rates) + expected_g1_alloc = g1_vacant_rate / g1_improved_median + + g1_rows = result[result["model_group"] == "G1"] + assert (g1_rows["lycd_land_alloc"] - expected_g1_alloc).abs().max() < 1e-6 + + # land values must be non-negative + assert (result["lycd_land_value"] >= 0).all() + + +def test_lycd_clamping(): + """Land value is clamped to [0, market_value].""" + # Construct a case where the raw rate would overshoot + df = pd.DataFrame({ + "key": ["X1", "X2"], + "model_group": ["G1", "G1"], + "model_market_value": [100_000, 100_000], + "land_area_sqft": [1_000, 100_000], # huge lot → would overshoot + "is_vacant": [False, False], + }) + settings = _make_settings() + result = calc_lycd_land_values(df, settings, land_alloc=0.20) + + # No land value may exceed the market value + assert (result["lycd_land_value"] <= result["model_market_value"] + 1e-6).all() + assert (result["lycd_land_value"] >= 0).all() + + +def test_lycd_metric_units(): + """Works with metric (sqm) settings.""" + df = pd.DataFrame({ + "key": ["M1", "M2", "M3"], + "model_group": ["G1", "G1", "G1"], + "model_market_value": [200_000, 300_000, 50_000], + "land_area_sqm": [500, 700, 400], + "is_vacant": [False, False, True], + }) + settings = _make_settings(units="metric") + result = calc_lycd_land_values(df, settings, land_alloc=0.20) + + # median improved: mv=250k, lot=600 sqm + # rate = (250_000 * 0.20) / 600 + expected_rate = (250_000 * 0.20) / 600 + assert (result["lycd_local_land_rate"] - expected_rate).abs().max() < 1e-6 + + +def test_lycd_no_vacant_falls_back_to_global(): + """A group with no vacant properties falls back to the global allocation.""" + df = pd.DataFrame({ + "key": ["A1", "A2", "B1", "B2", "B3"], + "model_group": ["G1", "G1", "G2", "G2", "G2"], + "model_market_value": [200_000, 300_000, 400_000, 500_000, 80_000], + "land_area_sqft": [5_000, 7_000, 10_000, 12_000, 8_000], + # G1 has no vacant; G2 has B3 as vacant + "is_vacant": [False, False, False, False, True], + }) + settings = _make_settings() + result = calc_lycd_land_values(df, settings, land_alloc=None) + + # Both groups should receive a valid (non-NaN) allocation + assert result["lycd_land_alloc"].notna().all() + # The G1 allocation should equal the global allocation + g1_alloc = result[result["model_group"] == "G1"]["lycd_land_alloc"].iloc[0] + g2_alloc = result[result["model_group"] == "G2"]["lycd_land_alloc"].iloc[0] + + # G2 has its own vacant data so it will differ; G1 must be the global fallback + # Global: all vacant = B3 (80k/8k=10), all improved = A1,A2,B1,B2 + all_improved_rates = sorted([200_000/5_000, 300_000/7_000, 400_000/10_000, 500_000/12_000]) + global_improved_median = np.median(all_improved_rates) + global_vacant_rate = 80_000 / 8_000 + expected_global_alloc = global_vacant_rate / global_improved_median + + assert abs(g1_alloc - expected_global_alloc) < 1e-6 From 73e2f705cb979ad566d2385bf50b7de5a4f6db2e Mon Sep 17 00:00:00 2001 From: Russell Richie Date: Tue, 31 Mar 2026 12:02:34 -0400 Subject: [PATCH 3/5] fix: use positional alignment in _contrib_to_unit_values and _add_prediction_to_contribution to prevent cartesian-product OOM Co-Authored-By: Claude Sonnet 4.6 --- openavmkit/modeling.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/openavmkit/modeling.py b/openavmkit/modeling.py index 88a6163f..9acd5291 100644 --- a/openavmkit/modeling.py +++ b/openavmkit/modeling.py @@ -6207,8 +6207,27 @@ def _contrib_to_unit_values(df_contrib: pd.DataFrame, df_base: pd.DataFrame, spl drop_from_base = [c for c in reserved if c != the_key and c in df_base.columns] df_base_trim = df_base.drop(columns=drop_from_base) - # merge - df_merged = df_contrib_renamed.merge(df_base_trim, on=the_key, how="left") + # Use positional alignment instead of a key-based merge. + # df_contrib is built row-by-row from df_base (same order), so positional + # alignment is correct and avoids the pandas non-unique-join path that + # produces a cartesian product when df_base has a non-sequential index + # (e.g. after sort_values) and an Arrow-backed string key dtype that + # triggers the slow path in pandas merge internals. On a ~421k-row + # universe this manifests as a 421k² OOM crash. + if len(df_contrib_renamed) != len(df_base_trim): + import warnings + warnings.warn( + f"_contrib_to_unit_values: contrib ({len(df_contrib_renamed)} rows) and " + f"base ({len(df_base_trim)} rows) have different sizes; " + f"falling back to key-based merge on '{the_key}'.", + UserWarning, + ) + df_merged = df_contrib_renamed.merge(df_base_trim, on=the_key, how="left") + else: + df_contrib_aligned = df_contrib_renamed.reset_index(drop=True) + df_base_aligned = df_base_trim.reset_index(drop=True) + cols_to_add = [c for c in df_base_aligned.columns if c not in df_contrib_aligned.columns] + df_merged = pd.concat([df_contrib_aligned, df_base_aligned[cols_to_add]], axis=1) # compute per-unit contributions normal_vars = [] @@ -6246,7 +6265,17 @@ def _add_prediction_to_contribution( ): the_key = "key_sale" if "key_sale" in df.columns and "univ" not in split_name else "key" df_pred = df[[the_key, "prediction"]] - df_combined = df_contrib.merge(df_pred, on=the_key, how="left") + # Use positional alignment to avoid cartesian-product OOM (same root cause + # as _contrib_to_unit_values: df has a non-sequential index from + # sort_values, while df_contrib has 0..N-1, and an Arrow-backed string key + # dtype triggers the slow path in pandas merge internals). + if len(df_contrib) == len(df_pred): + df_combined = pd.concat( + [df_contrib.reset_index(drop=True), df_pred[["prediction"]].reset_index(drop=True)], + axis=1, + ) + else: + df_combined = df_contrib.merge(df_pred, on=the_key, how="left") df_combined["check_delta"] = df_combined["prediction"] - df_combined["contribution_sum"] return df_combined From 25e8c9091b9509d224d89176cd843b1337e7c879 Mon Sep 17 00:00:00 2001 From: Russell Richie Date: Fri, 3 Apr 2026 15:37:08 -0400 Subject: [PATCH 4/5] feat: add subarea_field support to LYCD land valuation calc_lycd_land_values now accepts a subarea_field parameter (e.g. "neighborhood", "census_tract") to derive a separate local land rate per (subarea, model_group) cell instead of one rate per model group across the entire jurisdiction. Cells below min_improved_per_cell fall back to the model-group rate. _derive_lycd_alloc_from_data updated to match with group_field, subarea_field, and alloc_by_group_fallback parameters. Co-Authored-By: Claude Sonnet 4.6 --- openavmkit/land.py | 209 +++++++++++++++++++++++++++++++++------------ 1 file changed, 154 insertions(+), 55 deletions(-) diff --git a/openavmkit/land.py b/openavmkit/land.py index f15a1c32..b9af6d09 100644 --- a/openavmkit/land.py +++ b/openavmkit/land.py @@ -795,18 +795,20 @@ def calc_lycd_land_values( settings: dict, land_alloc: "float | dict | None" = None, market_value_field: str = "model_market_value", + subarea_field: "str | None" = None, + min_improved_per_cell: int = 10, ) -> pd.DataFrame: """Compute land values using the "Least You Can Do" (LYCD) method. - For each model group this method: + For each local area this method: 1. Takes the median market value and median lot size of **improved** - (non-vacant) properties in the group. + (non-vacant) properties in the area. 2. Derives a uniform local land rate:: local_land_rate = (median_market_value * land_alloc_pct) / median_lot_size - 3. Applies that rate to every parcel:: + 3. Applies that rate to every parcel in the area:: land_value = local_land_rate * parcel_lot_size @@ -815,12 +817,18 @@ def calc_lycd_land_values( side-by-side disparities that arise from naively multiplying each parcel's market value by a fixed allocation fraction. - When ``land_alloc`` is ``None`` the allocation for each group is derived - automatically from the data. For each group the median per-unit value of - **vacant** properties is divided by the median per-unit value of **improved** - properties; that ratio is the implied land allocation. If a group has no - vacant properties the global ratio (across all groups combined) is used as a - fallback. + When ``subarea_field`` is provided, rates are computed per + ``(subarea, model_group)`` cell rather than per model group alone. This + captures the sharp neighbourhood-to-neighbourhood price jumps that a single + county-wide rate per land-use type would miss. Cells with fewer than + ``min_improved_per_cell`` improved parcels fall back first to the + model-group rate, then to the global rate. + + When ``land_alloc`` is ``None`` the allocation for each cell is derived + automatically from the data. The median per-unit value of **vacant** + properties is divided by the median per-unit value of **improved** + properties; that ratio is the implied land allocation. Fallback order: + cell → model group → global. Parameters ---------- @@ -832,28 +840,41 @@ def calc_lycd_land_values( - ``market_value_field`` (default ``"model_market_value"``) - ``"is_vacant"`` (bool) - ``f"land_area_{unit}"`` where *unit* is ``"sqft"`` or ``"sqm"`` + - ``subarea_field`` column, if provided settings : dict Settings dictionary (used to determine the area unit). land_alloc : float, dict, or None Fraction of market value attributable to land. - - **float** – applied uniformly to all model groups (e.g. ``0.20`` for 20 %). - - **dict** – ``{model_group_id: float}`` for per-group allocations. + - **float** – applied uniformly to all cells. + - **dict** – keyed by ``model_group`` (``{mg: float}``) for per-group + allocations; individual cells inherit their group's allocation. - **None** – derived automatically from the ratio of vacant-to-improved - per-unit market values within each group. + per-unit market values, at the finest available level. market_value_field : str Column name holding the market value estimate. Defaults to ``"model_market_value"``. + subarea_field : str or None + Column name whose values define geographic subareas (e.g. + ``"neighborhood"``, ``"census_tract"``). When supplied, one local land + rate is derived per ``(subarea, model_group)`` cell. When ``None`` + (default), a single rate is derived per ``model_group``, which is + equivalent to treating the entire county as one area per land-use type. + min_improved_per_cell : int + Minimum number of improved parcels required in a + ``(subarea, model_group)`` cell before that cell gets its own rate. + Cells below this threshold fall back to the model-group rate. + Ignored when ``subarea_field`` is ``None``. Default: 10. Returns ------- pd.DataFrame Copy of *df* with three additional columns: - - ``"lycd_land_alloc"`` – land allocation fraction used for the group. - - ``"lycd_local_land_rate"`` – implied per-area-unit land rate for the group. + - ``"lycd_land_alloc"`` – land allocation fraction used for the cell. + - ``"lycd_local_land_rate"`` – implied per-area-unit land rate for the cell. - ``"lycd_land_value"`` – resulting land value (clamped to ``[0, market_value]``). """ unit = area_unit(settings) @@ -862,39 +883,95 @@ def calc_lycd_land_values( df = df.copy() model_groups = df["model_group"].unique() - # Resolve land_alloc to a per-group dict + # ------------------------------------------------------------------ + # Resolve land_alloc to a per-model-group dict (used as fallback even + # when subarea_field is set, and as the primary source when it is not). + # ------------------------------------------------------------------ if isinstance(land_alloc, dict): alloc_by_group = {mg: land_alloc.get(mg, np.nan) for mg in model_groups} elif land_alloc is not None: alloc_by_group = {mg: float(land_alloc) for mg in model_groups} else: - alloc_by_group = _derive_lycd_alloc_from_data(df, lot_area_field, market_value_field) + alloc_by_group = _derive_lycd_alloc_from_data( + df, lot_area_field, market_value_field, group_field="model_group" + ) - local_land_rates = {} - resolved_allocs = {} + # ------------------------------------------------------------------ + # Helper: compute one local land rate for a slice of the dataframe. + # Returns np.nan if the slice is too small or the medians are invalid. + # ------------------------------------------------------------------ + def _rate_for_mask(mask_improved, alloc): + sub = df[mask_improved] + if len(sub) == 0 or np.isnan(alloc): + return np.nan + median_mv = sub[market_value_field].median() + median_lot = sub[lot_area_field].median() + if median_lot <= 0 or np.isnan(median_lot) or np.isnan(median_mv): + return np.nan + return (median_mv * alloc) / median_lot + # ------------------------------------------------------------------ + # Build per-model-group fallback rates (always needed). + # ------------------------------------------------------------------ + group_rates = {} for mg in model_groups: - mask_improved = df["model_group"].eq(mg) & df["is_vacant"].eq(False) - df_improved = df[mask_improved] - - alloc = alloc_by_group.get(mg, np.nan) - resolved_allocs[mg] = alloc - - if len(df_improved) == 0 or np.isnan(alloc): - local_land_rates[mg] = np.nan - continue - - median_mv = df_improved[market_value_field].median() - median_lot = df_improved[lot_area_field].median() - - if median_lot <= 0 or np.isnan(median_lot) or np.isnan(median_mv): - local_land_rates[mg] = np.nan - continue + mask = df["model_group"].eq(mg) & df["is_vacant"].eq(False) + group_rates[mg] = _rate_for_mask(mask, alloc_by_group.get(mg, np.nan)) + + # ------------------------------------------------------------------ + # Assign rates and allocs row-by-row into output columns. + # ------------------------------------------------------------------ + df["lycd_land_alloc"] = np.nan + df["lycd_local_land_rate"] = np.nan + + if subarea_field is None: + # No geographic subdivision: one rate per model group. + for mg in model_groups: + idx = df["model_group"].eq(mg) + df.loc[idx, "lycd_local_land_rate"] = group_rates[mg] + df.loc[idx, "lycd_land_alloc"] = alloc_by_group.get(mg, np.nan) + else: + # Geographic subdivision: one rate per (subarea, model_group) cell, + # with fallback to model-group rate for thin cells. + subareas = df[subarea_field].unique() + + # Pre-derive per-cell allocations when land_alloc is None. + if land_alloc is None: + cell_allocs = _derive_lycd_alloc_from_data( + df, + lot_area_field, + market_value_field, + group_field="model_group", + subarea_field=subarea_field, + alloc_by_group_fallback=alloc_by_group, + ) + else: + cell_allocs = {} # will use alloc_by_group for every cell + + for mg in model_groups: + mask_mg = df["model_group"].eq(mg) + for sa in subareas: + mask_sa = df[subarea_field].eq(sa) + mask_cell_improved = mask_mg & mask_sa & df["is_vacant"].eq(False) + n_improved = mask_cell_improved.sum() + idx = mask_mg & mask_sa + + if n_improved >= min_improved_per_cell: + cell_key = (sa, mg) + alloc = cell_allocs.get(cell_key, alloc_by_group.get(mg, np.nan)) + rate = _rate_for_mask(mask_cell_improved, alloc) + if np.isnan(rate): + # Cell medians invalid despite enough rows; fall back. + rate = group_rates[mg] + alloc = alloc_by_group.get(mg, np.nan) + else: + # Too few improved parcels in this cell; use group rate. + rate = group_rates[mg] + alloc = alloc_by_group.get(mg, np.nan) - local_land_rates[mg] = (median_mv * alloc) / median_lot + df.loc[idx, "lycd_local_land_rate"] = rate + df.loc[idx, "lycd_land_alloc"] = alloc - df["lycd_land_alloc"] = df["model_group"].map(resolved_allocs) - df["lycd_local_land_rate"] = df["model_group"].map(local_land_rates) df["lycd_land_value"] = df["lycd_local_land_rate"] * df[lot_area_field] # Clamp: must be >= 0 and <= market value @@ -909,16 +986,16 @@ def _derive_lycd_alloc_from_data( df: pd.DataFrame, lot_area_field: str, market_value_field: str, + group_field: str = "model_group", + subarea_field: "str | None" = None, + alloc_by_group_fallback: "dict | None" = None, ) -> dict: - """Derive per-group land allocation fractions from vacant vs improved values. - - For each model group the allocation is: - - median(market_value / lot_area) for vacant parcels - ───────────────────────────────────────────────── - median(market_value / lot_area) for improved parcels + """Derive land allocation fractions from vacant vs improved per-unit values. - If a group has no vacant parcels the global ratio is used as a fallback. + When ``subarea_field`` is None, returns ``{model_group: alloc}``. + When ``subarea_field`` is provided, returns ``{(subarea, model_group): alloc}``, + falling back to the group-level allocation for cells with no vacant parcels, + and to the global allocation if the group also has none. """ def _per_unit_median(mask): sub = df[mask].copy() @@ -929,20 +1006,42 @@ def _per_unit_median(mask): # Global fallback global_vacant_rate = _per_unit_median(df["is_vacant"].eq(True)) global_improved_rate = _per_unit_median(df["is_vacant"].eq(False)) - if global_improved_rate > 0 and not np.isnan(global_vacant_rate): + if global_improved_rate > 0 and not ( + np.isnan(global_vacant_rate) or np.isnan(global_improved_rate) + ): global_alloc = global_vacant_rate / global_improved_rate else: global_alloc = np.nan - result = {} - for mg in df["model_group"].unique(): - mask_mg = df["model_group"].eq(mg) - vacant_rate = _per_unit_median(mask_mg & df["is_vacant"].eq(True)) - improved_rate = _per_unit_median(mask_mg & df["is_vacant"].eq(False)) - - if improved_rate > 0 and not np.isnan(vacant_rate): - result[mg] = vacant_rate / improved_rate + # Per-group allocations (always computed; used as fallback for cells). + group_allocs = {} + for mg in df[group_field].unique(): + mask_mg = df[group_field].eq(mg) + vr = _per_unit_median(mask_mg & df["is_vacant"].eq(True)) + ir = _per_unit_median(mask_mg & df["is_vacant"].eq(False)) + if ir > 0 and not (np.isnan(vr) or np.isnan(ir)): + group_allocs[mg] = vr / ir else: - result[mg] = global_alloc + group_allocs[mg] = ( + alloc_by_group_fallback.get(mg, global_alloc) + if alloc_by_group_fallback + else global_alloc + ) + + if subarea_field is None: + return group_allocs + + # Per-(subarea, group) allocations with group → global fallback. + result = {} + for mg in df[group_field].unique(): + mask_mg = df[group_field].eq(mg) + for sa in df[subarea_field].unique(): + mask_sa = df[subarea_field].eq(sa) + vr = _per_unit_median(mask_mg & mask_sa & df["is_vacant"].eq(True)) + ir = _per_unit_median(mask_mg & mask_sa & df["is_vacant"].eq(False)) + if ir > 0 and not (np.isnan(vr) or np.isnan(ir)): + result[(sa, mg)] = vr / ir + else: + result[(sa, mg)] = group_allocs.get(mg, global_alloc) return result \ No newline at end of file From 2e31553d24d112266fc55bf8f7c58cf54b0abfca Mon Sep 17 00:00:00 2001 From: Russell Richie Date: Mon, 13 Apr 2026 10:32:16 -0400 Subject: [PATCH 5/5] fix: accumulated compatibility patches for pyarrow/pandas/scipy compat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Split-checkpoint contributions strategy (benchmark, pipeline, modeling) - Thread `do_contributions` flag through the full call chain: `run_models`, `_run_models`, `run_one_model`, `run_one_hedonic_model`, `_predict_one_model`, `_run_hedonic_models`, `_write_model_results`, `write_model_parameters` - Add `_DS` and `_SMRContribContext` helper classes (module-level, picklable) - `_write_model_results`: save `_smr_for_contribs.pkl` + `_model_features.json` when `do_contributions=False`; enables deferred SHAP pass checkpointing - New `compute_model_contributions(settings)` in pipeline.py: loads saved `_smr_for_contribs.pkl` files per model group, runs contributions pass, deletes pkl; allows crash recovery without retraining - `finalize_models`: expose `run_main/vacant/hedonic/ensemble`, `do_shaps`, `do_plots`, `do_contributions` as real parameters (were hardcoded True/False) - `_run_models`: add per-group `ind_vars` override from `settings.modeling.models..group_overrides..ind_vars` ## Bug fixes - `data.py`: `from scipy.spatial import cKDTree` (private submodule removed in scipy >= 1.14) - `data.py`: move `astype(int)` cast to after `.loc` override in `_basic_geo_enrichment` — pandas 2.x rejects float→int64 via `.loc` - `sales_scrutiny_study.py`: go through `.astype(object)` before `.astype(str)` to avoid ArrowNotImplementedError on null-dtype columns (pyarrow >= 22) - `shap_analysis.py`: warn + filter instead of raising ValueError when `list_vars` contains features dropped by variable selection - `tuning.py`: guard `_lightgbm_rolling_origin_cv` against n_samples < n_splits - `utilities/cache.py`: replace `col.values == ...` with `col.eq(...)` and wrap `.sum()` in `int()` — ArrowExtensionArray has no `.values` sum - `utilities/stats.py`: multiple `calc_correlations` guards for sparse/small groups — empty naive_corr early-exit, all-NA score break, `len(remaining) <= 1` early return with correct schema, `.columns.tolist()[0]` for Arrow-backed Index compat ## Misc - `.gitignore`: add `.DS_Store` and `.pytest_cache/` Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 6 ++ openavmkit/benchmark.py | 124 +++++++++++++++++++++++++---- openavmkit/data.py | 9 ++- openavmkit/modeling.py | 7 +- openavmkit/pipeline.py | 95 ++++++++++++++++++++-- openavmkit/sales_scrutiny_study.py | 4 +- openavmkit/shap_analysis.py | 7 +- openavmkit/tuning.py | 2 + openavmkit/utilities/cache.py | 6 +- openavmkit/utilities/stats.py | 16 +++- 10 files changed, 238 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index 80dc3a63..7f4190c2 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,9 @@ tests/cache/ cache/ out/ + +# OS +.DS_Store + +# Testing +.pytest_cache/ diff --git a/openavmkit/benchmark.py b/openavmkit/benchmark.py index 6185707c..94cc9052 100644 --- a/openavmkit/benchmark.py +++ b/openavmkit/benchmark.py @@ -840,7 +840,8 @@ def run_models( run_hedonic: bool = True, run_ensemble: bool = True, do_shaps: bool = False, - do_plots: bool = False + do_plots: bool = False, + do_contributions: bool = True, ): """ Runs predictive models on the given SalesUniversePair. @@ -945,7 +946,8 @@ def run_models( verbose, run_ensemble, do_shaps=do_shaps, - do_plots=do_plots + do_plots=do_plots, + do_contributions=do_contributions, ) if mg_results is not None and save_results: dict_all_results[model_group] = mg_results @@ -1199,6 +1201,7 @@ def run_one_model( hedonic: bool = False, test_keys: list[str] | None = None, train_keys: list[str] | None = None, + do_contributions: bool = True, ) -> SingleModelResults | None: """ Run a single model based on provided parameters and return its results. @@ -1406,7 +1409,7 @@ def run_one_model( t.start("write") main_vacant_hedonic = "hedonic" if hedonic else "vacant" if vacant_only else "main" location = get_model_location(settings, main_vacant_hedonic, model_name) - _write_model_results(results, outpath, settings, location, verbose=verbose) + _write_model_results(results, outpath, settings, location, verbose=verbose, do_contributions=do_contributions) t.stop("write") return results @@ -1428,6 +1431,7 @@ def run_one_hedonic_model( hedonic_test_against_vacant_sales: bool = True, save_results: bool = False, verbose: bool = False, + do_contributions: bool = True, ): """Run a single hedonic model based on provided parameters and return its results. @@ -1513,6 +1517,7 @@ def run_one_hedonic_model( settings=settings, save_results=save_results, verbose=verbose, + do_contributions=do_contributions, ) return results @@ -1766,6 +1771,7 @@ def _predict_one_model( settings: dict, save_results: bool = False, verbose: bool = False, + do_contributions: bool = True, ) -> SingleModelResults: """ Predict results for one model, using saved results if available. @@ -1834,15 +1840,15 @@ def _predict_one_model( results = predict_slice(ds, slice_model, timing, verbose) if save_results: - + mvh = settings.get("modeling", {}).get("models", {}).get(main_vacant_hedonic, {}) model_entry = mvh.get("model_name", mvh.get("default", {})) location = model_entry.get("location", None) if location is None: location = get_important_field(settings, "loc_neighborhood") - + location = get_model_location(settings, main_vacant_hedonic, model_name) - _write_model_results(results, outpath, settings, location, verbose=verbose) + _write_model_results(results, outpath, settings, location, verbose=verbose, do_contributions=do_contributions) return results @@ -2130,13 +2136,61 @@ def _assemble_model_results(results: SingleModelResults, settings: dict): return dfs -def _write_model_results(results: SingleModelResults, outpath: str, settings: dict, location: str = None, verbose:bool = False): +class _DS: + """Minimal stand-in for DataSplit, used only to carry ind_vars and X matrices for the deferred contributions pass.""" + def __init__(self, ind_vars, X_test, X_sales, X_univ): + self.ind_vars = ind_vars + self.X_test = X_test + self.X_sales = X_sales + self.X_univ = X_univ + + +class _SMRContribContext: + """ + Minimal context needed by write_model_parameters / write_shaps for the + contributions-only pass. Holds the trained model, ind_vars, X matrices, + and the four DataFrames (geometry stripped to reduce pickle size). + """ + def __init__(self, model, ind_vars, df_train, df_test, df_sales, df_universe): + self.model = model + # Subset to ind_vars only — avoid storing the full DataFrames twice + safe_ind_vars = [v for v in ind_vars if v in df_test.columns] + self.ds = _DS( + ind_vars=ind_vars, + X_test=df_test[safe_ind_vars], + X_sales=df_sales[safe_ind_vars], + X_univ=df_universe[safe_ind_vars], + ) + self.df_train = df_train + self.df_test = df_test + self.df_sales = df_sales + self.df_universe = df_universe + + +def _write_model_results( + results: SingleModelResults, + outpath: str, + settings: dict, + location: str = None, + verbose: bool = False, + do_contributions: bool = True, +): """ Write model results to disk in parquet and CSV formats. + + Parameters + ---------- + do_contributions : bool, optional + If True (default), compute and write SHAP contribution CSVs immediately. + If False, skip contributions and instead save a ``_smr_for_contribs.pkl`` + file alongside the predictions so that + :func:`openavmkit.pipeline.compute_model_contributions` can run the + contributions pass separately (allowing it to be checkpointed + independently of model training). """ - + print(f"Write model results to {outpath}") - + dfs = _assemble_model_results(results, settings) path = f"{outpath}/{results.model_name}" if "*" in path: @@ -2144,11 +2198,11 @@ def _write_model_results(results: SingleModelResults, outpath: str, settings: di os.makedirs(path, exist_ok=True) for key in dfs: df = dfs[key] - + if "geometry" in df.columns: df = gpd.GeoDataFrame(df, geometry="geometry", crs=getattr(df, "crs", None)) df = ensure_geometries(df) - + df.to_parquet(f"{path}/pred_{key}.parquet") if "geometry" in df: df = df.drop(columns=["geometry"]) @@ -2165,10 +2219,30 @@ def _write_model_results(results: SingleModelResults, outpath: str, settings: di with open(f"{path}/pred_universe.pkl", "wb") as f: pickle.dump(results.pred_univ, f, protocol=pickle.HIGHEST_PROTOCOL) - + params_path = f"{path}" - - write_model_parameters(results.model, results, location, params_path, verbose=verbose) + + if not do_contributions: + # Save the minimal context needed for the deferred contributions pass. + # Geometry columns are stripped to reduce pickle size; all other columns + # (needed by _contrib_to_unit_values) are preserved. + ctx = _SMRContribContext( + model=results.model, + ind_vars=results.ds.ind_vars, + df_train=results.df_train.drop(columns=["geometry"], errors="ignore"), + df_test=results.df_test.drop(columns=["geometry"], errors="ignore"), + df_sales=results.df_sales.drop(columns=["geometry"], errors="ignore"), + df_universe=results.df_universe.drop(columns=["geometry"], errors="ignore"), + ) + smr_pkl_path = f"{path}/_smr_for_contribs.pkl" + with open(smr_pkl_path, "wb") as f: + pickle.dump(ctx, f, protocol=pickle.HIGHEST_PROTOCOL) + import json as _json + with open(f"{path}/_model_features.json", "w") as f: + _json.dump({"ind_vars": list(ctx.ds.ind_vars)}, f) + print(f" Contributions deferred — saved context to {smr_pkl_path}") + + write_model_parameters(results.model, results, location, params_path, verbose=verbose, do_contributions=do_contributions) def get_model_location( @@ -3506,7 +3580,8 @@ def _run_hedonic_models( verbose: bool = False, save_results: bool = False, run_ensemble: bool = True, - do_plots: bool = False + do_plots: bool = False, + do_contributions: bool = True, ): """ Run hedonic models and ensemble them, then update the benchmark. @@ -4143,7 +4218,8 @@ def _run_models( verbose: bool = False, run_ensemble: bool = True, do_shaps: bool = False, - do_plots: bool = False + do_plots: bool = False, + do_contributions: bool = True, ): """ Run models for a given model group and process ensemble results. @@ -4200,6 +4276,18 @@ def _run_models( model_entries = settings_model.get("models").get(main_vacant_hedonic, {}) + # Apply per-group ind_vars overrides from settings.modeling.models..group_overrides. + _group_ind_vars = ( + model_entries + .get("group_overrides", {}) + .get(model_group, {}) + .get("ind_vars", None) + ) + if _group_ind_vars is not None: + import copy + model_entries = copy.deepcopy(model_entries) + model_entries.setdefault("default", {})["ind_vars"] = _group_ind_vars + if models_to_run is None: models_to_run = list(model_entries.keys()) @@ -4299,6 +4387,7 @@ def _run_models( use_saved_params=use_saved_params, save_results=save_results, verbose=verbose, + do_contributions=do_contributions, ) if results is not None: model_results[model_name] = results @@ -4415,7 +4504,8 @@ def _run_models( verbose=verbose, save_results=save_results, run_ensemble=run_ensemble, - do_plots=do_plots + do_plots=do_plots, + do_contributions=do_contributions, ) t.stop("run hedonic models") diff --git a/openavmkit/data.py b/openavmkit/data.py index e8e601b3..1d8296e7 100644 --- a/openavmkit/data.py +++ b/openavmkit/data.py @@ -27,7 +27,7 @@ import pyarrow.parquet as pq import geopandas as gpd -from scipy.spatial._ckdtree import cKDTree +from scipy.spatial import cKDTree from shapely.geometry import Polygon from shapely.geometry import LineString from shapely.ops import unary_union @@ -2866,13 +2866,14 @@ def _basic_geo_enrichment( gdf[f"land_area_{unit}"] = gdf[f"land_area_{unit}"].combine_first( gdf[f"land_area_gis_{unit}"] ) - gdf[f"land_area_{unit}"] = np.round( - gdf[f"land_area_{unit}"].combine_first(gdf[f"land_area_gis_{unit}"]) - ).astype(int) + gdf[f"land_area_{unit}"] = gdf[f"land_area_{unit}"].combine_first( + gdf[f"land_area_gis_{unit}"] + ) gdf.loc[ gdf[f"land_area_given_{unit}"].le(0) | gdf[f"land_area_given_{unit}"].isna(), f"land_area_{unit}", ] = gdf[f"land_area_gis_{unit}"] + gdf[f"land_area_{unit}"] = np.round(gdf[f"land_area_{unit}"]).astype(int) # Calculate difference gdf[f"land_area_gis_delta_{unit}"] = gdf[f"land_area_gis_{unit}"] - gdf[f"land_area_{unit}"] diff --git a/openavmkit/modeling.py b/openavmkit/modeling.py index 9acd5291..f1c70123 100644 --- a/openavmkit/modeling.py +++ b/openavmkit/modeling.py @@ -6286,9 +6286,12 @@ def write_model_parameters( location: str, outpath: str, do_plot: bool = False, - verbose: bool = False + verbose: bool = False, + do_contributions: bool = True, ): - + if not do_contributions: + return + print(f"write model parameters to {outpath}") xs = { "test": smr.ds.X_test, diff --git a/openavmkit/pipeline.py b/openavmkit/pipeline.py index a179f8eb..be854a74 100644 --- a/openavmkit/pipeline.py +++ b/openavmkit/pipeline.py @@ -1656,6 +1656,13 @@ def finalize_models( save_params: bool = True, use_saved_params: bool = True, verbose: bool = False, + run_main: bool = True, + run_vacant: bool = True, + run_hedonic: bool = True, + run_ensemble: bool = True, + do_shaps: bool = False, + do_plots: bool = False, + do_contributions: bool = True, ) -> None: """ Tries out predictive models on the given SalesUniversePair, finalizes results and writes to disk. @@ -1680,6 +1687,18 @@ def finalize_models( Whether to use saved model parameters. verbose : bool, optional If True, prints additional information. + run_main : bool, optional + Whether to run main (improved) models. + run_vacant : bool, optional + Whether to run vacant land models. + run_hedonic : bool, optional + Whether to run hedonic (land/improvement split) models. + run_ensemble : bool, optional + Whether to run ensemble models. + do_shaps : bool, optional + Whether to compute SHAP values. + do_plots : bool, optional + Whether to generate plots. Returns ------- @@ -1694,15 +1713,79 @@ def finalize_models( use_saved_params, save_results=True, verbose=verbose, - run_main=True, - run_vacant=True, - run_hedonic=True, - run_ensemble=True, - do_shaps=False, - do_plots=False + run_main=run_main, + run_vacant=run_vacant, + run_hedonic=run_hedonic, + run_ensemble=run_ensemble, + do_shaps=do_shaps, + do_plots=do_plots, + do_contributions=do_contributions, ) +def compute_model_contributions(settings: dict, verbose: bool = False) -> None: + """ + Compute SHAP contribution CSVs from saved model artifacts. + + This is the second half of a split checkpoint strategy. Call + :func:`finalize_models` with ``do_contributions=False`` first (wrapped in + a ``3-model-02a-train-predict`` checkpoint) to train models and save + predictions. Then call this function (wrapped in a + ``3-model-02b-contributions`` checkpoint) to compute contributions + separately, so that a crash during the slow SHAP pass does not require + retraining. + + Looks for ``_smr_for_contribs.pkl`` files written by + :func:`_write_model_results` and calls + :func:`openavmkit.modeling.write_model_parameters` for each one, then + deletes the pickle. + + Parameters + ---------- + settings : dict + The settings dictionary (used to determine model location). + verbose : bool, optional + If True, prints additional information. + """ + import glob + import pickle + import os + + from openavmkit.modeling import write_model_parameters + from openavmkit.benchmark import get_model_location + + pkl_files = sorted(glob.glob("out/models/*/*/*/_smr_for_contribs.pkl")) + if not pkl_files: + print("compute_model_contributions: no _smr_for_contribs.pkl files found — nothing to do.") + return + + for pkl_path in pkl_files: + # path is e.g. out/models/residential_sf/main/lightgbm/_smr_for_contribs.pkl + parts = pkl_path.replace("\\", "/").split("/") + model_group = parts[-4] # e.g. residential_sf + mvh = parts[-3] # e.g. main / vacant + model_name = parts[-2] # e.g. lightgbm + outpath = "/".join(parts[:-1]) # directory containing the pkl + + print(f" Computing contributions: {model_group}/{mvh}/{model_name}") + + with open(pkl_path, "rb") as f: + ctx = pickle.load(f) + + location = get_model_location(settings, mvh, model_name) + write_model_parameters( + ctx.model, + ctx, + location, + outpath, + verbose=verbose, + do_contributions=True, + ) + + os.remove(pkl_path) + print(f" Done — removed {pkl_path}") + + def run_models( sup: SalesUniversePair, settings: dict, diff --git a/openavmkit/sales_scrutiny_study.py b/openavmkit/sales_scrutiny_study.py index d981bc71..49da1d1b 100644 --- a/openavmkit/sales_scrutiny_study.py +++ b/openavmkit/sales_scrutiny_study.py @@ -111,8 +111,8 @@ def __init__(self, df: pd.DataFrame, settings: dict, model_group: str): for key in stuff: df = stuff[key] df, cluster_fields = _mark_sales_scrutiny_clusters(df, settings) - df["ss_id"] = df["ss_id"].astype(str) - df["ss_id"] = df["model_group"] + "_" + key + "_" + df["ss_id"] + df["ss_id"] = df["ss_id"].astype(object).fillna("").astype(str) + df["ss_id"] = df["model_group"].astype(object).astype(str) + "_" + key + "_" + df["ss_id"] per_area = "" denominator = "" if key == "i": diff --git a/openavmkit/shap_analysis.py b/openavmkit/shap_analysis.py index ce4302ee..edc6baa1 100644 --- a/openavmkit/shap_analysis.py +++ b/openavmkit/shap_analysis.py @@ -213,7 +213,12 @@ def make_shap_table( existing = list(expl.feature_names) missing = [c for c in list_vars if c not in existing] if missing: - raise ValueError(f"These list_vars are missing from explanation features: {missing}") + import warnings + warnings.warn( + f"These list_vars are missing from explanation features " + f"(likely dropped by variable selection): {missing}" + ) + list_vars = [c for c in list_vars if c in existing] feature_cols = list_vars # enforce this order df_features = pd.DataFrame(vals, columns=expl.feature_names) diff --git a/openavmkit/tuning.py b/openavmkit/tuning.py index 1e7706ec..74ecb45c 100644 --- a/openavmkit/tuning.py +++ b/openavmkit/tuning.py @@ -485,6 +485,8 @@ def _catboost_rolling_origin_cv( def _lightgbm_rolling_origin_cv(X, y, params, n_splits=5, random_state=42, cat_vars=None): + if len(X) < n_splits: + return float("inf") kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state) mape_scores = [] diff --git a/openavmkit/utilities/cache.py b/openavmkit/utilities/cache.py index d7301e26..43accfdf 100644 --- a/openavmkit/utilities/cache.py +++ b/openavmkit/utilities/cache.py @@ -270,11 +270,11 @@ def write_cached_df( is_different = False if len(col_new) == len(col_orig): - values_equal = col_new.values == col_orig.values + values_equal = col_new.eq(col_orig) na_equal = col_new.isna() & col_orig.isna() - count_na_equal = na_equal.sum() - count_values_equal = values_equal.sum() + count_na_equal = int(na_equal.sum()) + count_values_equal = int(values_equal.sum()) count_to_match = len(col_new) diff --git a/openavmkit/utilities/stats.py b/openavmkit/utilities/stats.py index 75303d3e..f76439ab 100644 --- a/openavmkit/utilities/stats.py +++ b/openavmkit/utilities/stats.py @@ -694,8 +694,11 @@ def calc_correlations( while True: naive_corr = corr_full.loc[remaining, remaining] + if naive_corr.empty or len(naive_corr.columns) == 0: + break + # --- NEW: compute one explicit order and apply to both axes - order = _order_by_target_corr(naive_corr, target_col=naive_corr.columns[0]) + order = _order_by_target_corr(naive_corr, target_col=naive_corr.columns.tolist()[0]) naive_corr = naive_corr.loc[order, order] # ----------------------------------------------- @@ -713,6 +716,8 @@ def calc_correlations( score = strength * clarity * clarity + if score.isna().all(): + break min_score_idx = score.idxmin() try: min_score = score[min_score_idx] @@ -736,6 +741,11 @@ def calc_correlations( # drop all other variables that are more correlated with current_variable than # they are with the target. # ------------------------------------------------------------------ + if len(remaining) <= 1: + # Only target variable (or nothing) remains — skip second pass + _empty = pd.DataFrame(columns=["variable", "corr_strength", "corr_clarity", "corr_score"]) + _stub = first_run if first_run is not None else _empty + return {"initial": _stub, "final": _empty, "bad_vars": bad_vars} target_col = remaining[0] # process in descending strength-to-target order (excluding target itself) @@ -777,7 +787,7 @@ def calc_correlations( # Recompute scores for the *final* remaining set (same math as above) naive_corr = corr_full.loc[remaining, remaining] - order = _order_by_target_corr(naive_corr, target_col=naive_corr.columns[0]) + order = _order_by_target_corr(naive_corr, target_col=naive_corr.columns.tolist()[0]) naive_corr = naive_corr.loc[order, order] naive_corr_sans_target = naive_corr.iloc[1:, 1:] @@ -803,7 +813,7 @@ def calc_correlations( final_corr = corr_full.loc[remaining, remaining] # --- NEW: order final_corr the same way (by target corr within remaining set) - final_order = _order_by_target_corr(final_corr, target_col=final_corr.columns[0]) + final_order = _order_by_target_corr(final_corr, target_col=final_corr.columns.tolist()[0]) final_corr = final_corr.loc[final_order, final_order] # -----------------------------------------------