From 92dc4d9b76e4ea0bba8a831268e6d5fd7ff5ae4a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 2 Apr 2026 01:55:28 +0000
Subject: [PATCH 1/5] Vectorize building-parcel intersection area calculation
 in OvertureService

Replace slow row-by-row apply() with vectorized geopandas operations:
build a GeoSeries of building geometries aligned with the joined dataframe,
then call .intersection() and .area in one pass instead of per-row.

https://claude.ai/code/session_01Dmffw6eWUC7rrE6JD1SXXX
---
 openavmkit/utilities/overture.py | 33 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/openavmkit/utilities/overture.py b/openavmkit/utilities/overture.py
index 53e7ff83..82874dbb 100644
--- a/openavmkit/utilities/overture.py
+++ b/openavmkit/utilities/overture.py
@@ -395,26 +395,21 @@ def calculate_building_footprints(
         if verbose:
             print(f"--> Found {len(joined)} potential building-parcel intersections")
 
-        def calculate_intersection_area(row):
-            try:
-                parcel_geom = gdf_projected.loc[row.name, "geometry"]
-                building_idx = row["index_right"]
-                if pd.isna(building_idx):
-                    return 0.0
-                building_geom = buildings_projected.loc[building_idx, "geometry"]
-                if parcel_geom.intersects(building_geom):
-                    intersection = parcel_geom.intersection(building_geom)
-                    return intersection.area * unit_mult  # Convert to desired units
-                return 0.0
-            except Exception as e:
-                if verbose:
-                    print(f"Warning: Error calculating intersection area: {e}")
-                return 0.0
-
         t.start("calc_area")
-        # TODO: Optimize this step using vectorized operations if possible
-        # Calculate intersection areas
-        joined[field_name] = joined.apply(calculate_intersection_area, axis=1)
+        # Vectorized intersection area calculation
+        joined[field_name] = 0.0
+        valid_mask = ~joined["index_right"].isna()
+        if valid_mask.any():
+            valid_joined = joined[valid_mask]
+            building_geoms = buildings_projected.loc[
+                valid_joined["index_right"].astype(int), "geometry"
+            ].values
+            building_gs = gpd.GeoSeries(
+                building_geoms, index=valid_joined.index, crs=buildings_projected.crs
+            )
+            joined.loc[valid_mask, field_name] = (
+                valid_joined["geometry"].intersection(building_gs).area * unit_mult
+            )
         t.stop("calc_area")
 
         if verbose:

From f0224c37f04f2d9ae64609db438d15bf5a9c3e08 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 2 Apr 2026 03:34:44 +0000
Subject: [PATCH 2/5] Implement LYCD (Least You Can Do) land valuation method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds calc_lycd_land_values() to land.py, which values land by:
1. Computing a uniform local land rate per model group from the median
   market value and median lot size of improved properties
2. Applying that rate to every parcel (land_value = rate × lot_size)

This avoids the side-by-side disparity created by applying a flat
allocation % to each parcel's individual market value.

Supports a user-supplied land_alloc (float or per-group dict) or
automatic derivation from vacant vs improved per-unit value ratios.
Also fixes missing area_unit import in land.py.

https://claude.ai/code/session_011PMWoWpQazUGuCXKT1nmVg
---
 openavmkit/land.py | 162 +++++++++++++++++++++++++++++++++++++++++-
 tests/test_land.py | 171 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 331 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_land.py

diff --git a/openavmkit/land.py b/openavmkit/land.py
index 61a53656..f15a1c32 100644
--- a/openavmkit/land.py
+++ b/openavmkit/land.py
@@ -22,7 +22,7 @@
     add_area_fields,
 )
 from openavmkit.utilities.plotting import plot_histogram_df
-from openavmkit.utilities.settings import get_model_group_ids
+from openavmkit.utilities.settings import area_unit, get_model_group_ids
 
 from openavmkit.utilities.stats import calc_correlations, calc_cod, calc_r2, calc_mse_r2_adj_r2
 
@@ -787,4 +787,162 @@ def _convolve_land_analysis(
         print(f"TEST LAND RESULTS, MODEL GROUP : {model_group}, count: {count}")
         print("=" * 80)
         print(df_results_test.to_string())
-        print("")
\ No newline at end of file
+        print("")
+
+
+def calc_lycd_land_values(
+    df: pd.DataFrame,
+    settings: dict,
+    land_alloc: "float | dict | None" = None,
+    market_value_field: str = "model_market_value",
+) -> pd.DataFrame:
+    """Compute land values using the "Least You Can Do" (LYCD) method.
+
+    For each model group this method:
+
+    1. Takes the median market value and median lot size of **improved**
+       (non-vacant) properties in the group.
+    2. Derives a uniform local land rate::
+
+           local_land_rate = (median_market_value * land_alloc_pct) / median_lot_size
+
+    3. Applies that rate to every parcel::
+
+           land_value = local_land_rate * parcel_lot_size
+
+    Because every parcel's land value is driven by the *typical* parcel in its
+    area rather than its own improvement value, the method avoids the absurd
+    side-by-side disparities that arise from naively multiplying each parcel's
+    market value by a fixed allocation fraction.
+
+    When ``land_alloc`` is ``None`` the allocation for each group is derived
+    automatically from the data.  For each group the median per-unit value of
+    **vacant** properties is divided by the median per-unit value of **improved**
+    properties; that ratio is the implied land allocation.  If a group has no
+    vacant properties the global ratio (across all groups combined) is used as a
+    fallback.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing at minimum:
+
+        - ``"key"``
+        - ``"model_group"``
+        - ``market_value_field`` (default ``"model_market_value"``)
+        - ``"is_vacant"`` (bool)
+        - ``f"land_area_{unit}"`` where *unit* is ``"sqft"`` or ``"sqm"``
+
+    settings : dict
+        Settings dictionary (used to determine the area unit).
+    land_alloc : float, dict, or None
+        Fraction of market value attributable to land.
+
+        - **float** – applied uniformly to all model groups (e.g. ``0.20`` for 20 %).
+        - **dict** – ``{model_group_id: float}`` for per-group allocations.
+        - **None** – derived automatically from the ratio of vacant-to-improved
+          per-unit market values within each group.
+
+    market_value_field : str
+        Column name holding the market value estimate.  Defaults to
+        ``"model_market_value"``.
+
+    Returns
+    -------
+    pd.DataFrame
+        Copy of *df* with three additional columns:
+
+        - ``"lycd_land_alloc"`` – land allocation fraction used for the group.
+        - ``"lycd_local_land_rate"`` – implied per-area-unit land rate for the group.
+        - ``"lycd_land_value"`` – resulting land value (clamped to ``[0, market_value]``).
+    """
+    unit = area_unit(settings)
+    lot_area_field = f"land_area_{unit}"
+
+    df = df.copy()
+    model_groups = df["model_group"].unique()
+
+    # Resolve land_alloc to a per-group dict
+    if isinstance(land_alloc, dict):
+        alloc_by_group = {mg: land_alloc.get(mg, np.nan) for mg in model_groups}
+    elif land_alloc is not None:
+        alloc_by_group = {mg: float(land_alloc) for mg in model_groups}
+    else:
+        alloc_by_group = _derive_lycd_alloc_from_data(df, lot_area_field, market_value_field)
+
+    local_land_rates = {}
+    resolved_allocs = {}
+
+    for mg in model_groups:
+        mask_improved = df["model_group"].eq(mg) & df["is_vacant"].eq(False)
+        df_improved = df[mask_improved]
+
+        alloc = alloc_by_group.get(mg, np.nan)
+        resolved_allocs[mg] = alloc
+
+        if len(df_improved) == 0 or np.isnan(alloc):
+            local_land_rates[mg] = np.nan
+            continue
+
+        median_mv = df_improved[market_value_field].median()
+        median_lot = df_improved[lot_area_field].median()
+
+        if median_lot <= 0 or np.isnan(median_lot) or np.isnan(median_mv):
+            local_land_rates[mg] = np.nan
+            continue
+
+        local_land_rates[mg] = (median_mv * alloc) / median_lot
+
+    df["lycd_land_alloc"] = df["model_group"].map(resolved_allocs)
+    df["lycd_local_land_rate"] = df["model_group"].map(local_land_rates)
+    df["lycd_land_value"] = df["lycd_local_land_rate"] * df[lot_area_field]
+
+    # Clamp: must be >= 0 and <= market value
+    df["lycd_land_value"] = df["lycd_land_value"].clip(lower=0)
+    exceeds_mv = df["lycd_land_value"].gt(df[market_value_field])
+    df.loc[exceeds_mv, "lycd_land_value"] = df.loc[exceeds_mv, market_value_field]
+
+    return df
+
+
+def _derive_lycd_alloc_from_data(
+    df: pd.DataFrame,
+    lot_area_field: str,
+    market_value_field: str,
+) -> dict:
+    """Derive per-group land allocation fractions from vacant vs improved values.
+
+    For each model group the allocation is:
+
+        median(market_value / lot_area) for vacant parcels
+        ─────────────────────────────────────────────────
+        median(market_value / lot_area) for improved parcels
+
+    If a group has no vacant parcels the global ratio is used as a fallback.
+    """
+    def _per_unit_median(mask):
+        sub = df[mask].copy()
+        sub = sub[sub[lot_area_field].gt(0)]
+        rates = sub[market_value_field] / sub[lot_area_field]
+        return rates.median()
+
+    # Global fallback
+    global_vacant_rate = _per_unit_median(df["is_vacant"].eq(True))
+    global_improved_rate = _per_unit_median(df["is_vacant"].eq(False))
+    if global_improved_rate > 0 and not np.isnan(global_vacant_rate):
+        global_alloc = global_vacant_rate / global_improved_rate
+    else:
+        global_alloc = np.nan
+
+    result = {}
+    for mg in df["model_group"].unique():
+        mask_mg = df["model_group"].eq(mg)
+        vacant_rate = _per_unit_median(mask_mg & df["is_vacant"].eq(True))
+        improved_rate = _per_unit_median(mask_mg & df["is_vacant"].eq(False))
+
+        if improved_rate > 0 and not np.isnan(vacant_rate):
+            result[mg] = vacant_rate / improved_rate
+        else:
+            result[mg] = global_alloc
+
+    return result
\ No newline at end of file
diff --git a/tests/test_land.py b/tests/test_land.py
new file mode 100644
index 00000000..2487da96
--- /dev/null
+++ b/tests/test_land.py
@@ -0,0 +1,171 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from openavmkit.land import calc_lycd_land_values
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_settings(units="imperial"):
+    return {"locality": {"units": units}}
+
+
+def _make_df():
+    """Return a tiny universe with two model groups and a mix of
+    vacant / improved properties."""
+    return pd.DataFrame({
+        "key":                ["A1", "A2", "A3", "A4", "B1", "B2", "B3"],
+        "model_group":        ["G1", "G1", "G1", "G1", "G2", "G2", "G2"],
+        "model_market_value": [200_000, 250_000, 300_000, 50_000,
+                               400_000, 500_000, 80_000],
+        "land_area_sqft":     [5_000, 6_000, 7_000, 4_000,
+                               10_000, 12_000, 8_000],
+        # A4 and B3 are vacant lots; the rest are improved
+        "is_vacant":          [False, False, False, True,
+                               False, False, True],
+    })
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+def test_lycd_scalar_alloc():
+    """With a fixed 20 % allocation every group gets the same land rate formula."""
+    df = _make_df()
+    settings = _make_settings()
+    result = calc_lycd_land_values(df, settings, land_alloc=0.20)
+
+    assert "lycd_land_value" in result.columns
+    assert "lycd_local_land_rate" in result.columns
+    assert "lycd_land_alloc" in result.columns
+
+    # --- Group G1 ---
+    # improved: A1(200k/5k), A2(250k/6k), A3(300k/7k)
+    # median_mv_improved  = 250_000
+    # median_lot_improved = 6_000
+    # local_land_rate     = (250_000 * 0.20) / 6_000  ≈ 8.3333
+    g1_rate = (250_000 * 0.20) / 6_000
+    for _, row in result[result["model_group"] == "G1"].iterrows():
+        assert abs(row["lycd_local_land_rate"] - g1_rate) < 1e-6
+        expected_lv = min(g1_rate * row["land_area_sqft"], row["model_market_value"])
+        assert abs(row["lycd_land_value"] - expected_lv) < 1e-2
+        assert abs(row["lycd_land_alloc"] - 0.20) < 1e-10
+
+    # --- Group G2 ---
+    # improved: B1(400k/10k), B2(500k/12k)
+    # median_mv_improved  = 450_000
+    # median_lot_improved = 11_000
+    # local_land_rate     = (450_000 * 0.20) / 11_000
+    g2_rate = (450_000 * 0.20) / 11_000
+    for _, row in result[result["model_group"] == "G2"].iterrows():
+        assert abs(row["lycd_local_land_rate"] - g2_rate) < 1e-6
+
+
+def test_lycd_dict_alloc():
+    """Per-group allocations are applied correctly."""
+    df = _make_df()
+    settings = _make_settings()
+    alloc = {"G1": 0.15, "G2": 0.25}
+    result = calc_lycd_land_values(df, settings, land_alloc=alloc)
+
+    g1_rows = result[result["model_group"] == "G1"]
+    g2_rows = result[result["model_group"] == "G2"]
+
+    assert (g1_rows["lycd_land_alloc"] - 0.15).abs().max() < 1e-10
+    assert (g2_rows["lycd_land_alloc"] - 0.25).abs().max() < 1e-10
+
+
+def test_lycd_auto_alloc_from_vacant():
+    """When land_alloc=None the allocation is derived from vacant vs improved
+    per-unit values within each group."""
+    df = _make_df()
+    settings = _make_settings()
+    result = calc_lycd_land_values(df, settings, land_alloc=None)
+
+    # --- Group G1 ---
+    # vacant:  A4 → 50_000 / 4_000 = 12.5  $/sqft  (only one, median = 12.5)
+    # improved per-unit medians:
+    #   A1: 200_000/5_000 = 40.0
+    #   A2: 250_000/6_000 ≈ 41.667
+    #   A3: 300_000/7_000 ≈ 42.857
+    #   median ≈ 41.667
+    # implied alloc = 12.5 / 41.667 ≈ 0.3000
+    g1_vacant_rate = 50_000 / 4_000
+    g1_improved_rates = sorted([200_000 / 5_000, 250_000 / 6_000, 300_000 / 7_000])
+    g1_improved_median = np.median(g1_improved_rates)
+    expected_g1_alloc = g1_vacant_rate / g1_improved_median
+
+    g1_rows = result[result["model_group"] == "G1"]
+    assert (g1_rows["lycd_land_alloc"] - expected_g1_alloc).abs().max() < 1e-6
+
+    # land values must be non-negative
+    assert (result["lycd_land_value"] >= 0).all()
+
+
+def test_lycd_clamping():
+    """Land value is clamped to [0, market_value]."""
+    # Construct a case where the raw rate would overshoot
+    df = pd.DataFrame({
+        "key":                ["X1", "X2"],
+        "model_group":        ["G1", "G1"],
+        "model_market_value": [100_000, 100_000],
+        "land_area_sqft":     [1_000, 100_000],   # huge lot → would overshoot
+        "is_vacant":          [False, False],
+    })
+    settings = _make_settings()
+    result = calc_lycd_land_values(df, settings, land_alloc=0.20)
+
+    # No land value may exceed the market value
+    assert (result["lycd_land_value"] <= result["model_market_value"] + 1e-6).all()
+    assert (result["lycd_land_value"] >= 0).all()
+
+
+def test_lycd_metric_units():
+    """Works with metric (sqm) settings."""
+    df = pd.DataFrame({
+        "key":                ["M1", "M2", "M3"],
+        "model_group":        ["G1", "G1", "G1"],
+        "model_market_value": [200_000, 300_000, 50_000],
+        "land_area_sqm":      [500, 700, 400],
+        "is_vacant":          [False, False, True],
+    })
+    settings = _make_settings(units="metric")
+    result = calc_lycd_land_values(df, settings, land_alloc=0.20)
+
+    # median improved: mv=250k, lot=600 sqm
+    # rate = (250_000 * 0.20) / 600
+    expected_rate = (250_000 * 0.20) / 600
+    assert (result["lycd_local_land_rate"] - expected_rate).abs().max() < 1e-6
+
+
+def test_lycd_no_vacant_falls_back_to_global():
+    """A group with no vacant properties falls back to the global allocation."""
+    df = pd.DataFrame({
+        "key":                ["A1", "A2", "B1", "B2", "B3"],
+        "model_group":        ["G1", "G1", "G2", "G2", "G2"],
+        "model_market_value": [200_000, 300_000, 400_000, 500_000, 80_000],
+        "land_area_sqft":     [5_000, 7_000, 10_000, 12_000, 8_000],
+        # G1 has no vacant; G2 has B3 as vacant
+        "is_vacant":          [False, False, False, False, True],
+    })
+    settings = _make_settings()
+    result = calc_lycd_land_values(df, settings, land_alloc=None)
+
+    # Both groups should receive a valid (non-NaN) allocation
+    assert result["lycd_land_alloc"].notna().all()
+    # The G1 allocation should equal the global allocation
+    g1_alloc = result[result["model_group"] == "G1"]["lycd_land_alloc"].iloc[0]
+    g2_alloc = result[result["model_group"] == "G2"]["lycd_land_alloc"].iloc[0]
+
+    # G2 has its own vacant data so it will differ; G1 must be the global fallback
+    # Global: all vacant = B3 (80k/8k=10), all improved = A1,A2,B1,B2
+    all_improved_rates = sorted([200_000/5_000, 300_000/7_000, 400_000/10_000, 500_000/12_000])
+    global_improved_median = np.median(all_improved_rates)
+    global_vacant_rate = 80_000 / 8_000
+    expected_global_alloc = global_vacant_rate / global_improved_median
+
+    assert abs(g1_alloc - expected_global_alloc) < 1e-6

From 73e2f705cb979ad566d2385bf50b7de5a4f6db2e Mon Sep 17 00:00:00 2001
From: Russell Richie <drussellmrichie@github.com>
Date: Tue, 31 Mar 2026 12:02:34 -0400
Subject: [PATCH 3/5] fix: use positional alignment in _contrib_to_unit_values
 and _add_prediction_to_contribution to prevent cartesian-product OOM

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 openavmkit/modeling.py | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/openavmkit/modeling.py b/openavmkit/modeling.py
index 88a6163f..9acd5291 100644
--- a/openavmkit/modeling.py
+++ b/openavmkit/modeling.py
@@ -6207,8 +6207,27 @@ def _contrib_to_unit_values(df_contrib: pd.DataFrame, df_base: pd.DataFrame, spl
     drop_from_base = [c for c in reserved if c != the_key and c in df_base.columns]
     df_base_trim = df_base.drop(columns=drop_from_base)
 
-    # merge
-    df_merged = df_contrib_renamed.merge(df_base_trim, on=the_key, how="left")
+    # Use positional alignment instead of a key-based merge.
+    # df_contrib is built row-by-row from df_base (same order), so positional
+    # alignment is correct and avoids the pandas non-unique-join path that
+    # produces a cartesian product when df_base has a non-sequential index
+    # (e.g. after sort_values) and an Arrow-backed string key dtype that
+    # triggers the slow path in pandas merge internals. On a ~421k-row
+    # universe this manifests as a 421k² OOM crash.
+    if len(df_contrib_renamed) != len(df_base_trim):
+        import warnings
+        warnings.warn(
+            f"_contrib_to_unit_values: contrib ({len(df_contrib_renamed)} rows) and "
+            f"base ({len(df_base_trim)} rows) have different sizes; "
+            f"falling back to key-based merge on '{the_key}'.",
+            UserWarning,
+        )
+        df_merged = df_contrib_renamed.merge(df_base_trim, on=the_key, how="left")
+    else:
+        df_contrib_aligned = df_contrib_renamed.reset_index(drop=True)
+        df_base_aligned = df_base_trim.reset_index(drop=True)
+        cols_to_add = [c for c in df_base_aligned.columns if c not in df_contrib_aligned.columns]
+        df_merged = pd.concat([df_contrib_aligned, df_base_aligned[cols_to_add]], axis=1)
 
     # compute per-unit contributions
     normal_vars = []
@@ -6246,7 +6265,17 @@ def _add_prediction_to_contribution(
 ):
     the_key = "key_sale" if "key_sale" in df.columns and "univ" not in split_name else "key"
     df_pred = df[[the_key, "prediction"]]
-    df_combined = df_contrib.merge(df_pred, on=the_key, how="left")
+    # Use positional alignment to avoid cartesian-product OOM (same root cause
+    # as _contrib_to_unit_values: df has a non-sequential index from
+    # sort_values, while df_contrib has 0..N-1, and an Arrow-backed string key
+    # dtype triggers the slow path in pandas merge internals).
+    if len(df_contrib) == len(df_pred):
+        df_combined = pd.concat(
+            [df_contrib.reset_index(drop=True), df_pred[["prediction"]].reset_index(drop=True)],
+            axis=1,
+        )
+    else:
+        df_combined = df_contrib.merge(df_pred, on=the_key, how="left")
     df_combined["check_delta"] = df_combined["prediction"] - df_combined["contribution_sum"]
     return df_combined
 

From 25e8c9091b9509d224d89176cd843b1337e7c879 Mon Sep 17 00:00:00 2001
From: Russell Richie <drussellmrichie@github.com>
Date: Fri, 3 Apr 2026 15:37:08 -0400
Subject: [PATCH 4/5] feat: add subarea_field support to LYCD land valuation

calc_lycd_land_values now accepts a subarea_field parameter (e.g.
"neighborhood", "census_tract") to derive a separate local land rate per
(subarea, model_group) cell instead of one rate per model group across the
entire jurisdiction. Cells below min_improved_per_cell fall back to the
model-group rate. _derive_lycd_alloc_from_data updated to match with
group_field, subarea_field, and alloc_by_group_fallback parameters.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 openavmkit/land.py | 209 +++++++++++++++++++++++++++++++++------------
 1 file changed, 154 insertions(+), 55 deletions(-)

diff --git a/openavmkit/land.py b/openavmkit/land.py
index f15a1c32..b9af6d09 100644
--- a/openavmkit/land.py
+++ b/openavmkit/land.py
@@ -795,18 +795,20 @@ def calc_lycd_land_values(
     settings: dict,
     land_alloc: "float | dict | None" = None,
     market_value_field: str = "model_market_value",
+    subarea_field: "str | None" = None,
+    min_improved_per_cell: int = 10,
 ) -> pd.DataFrame:
     """Compute land values using the "Least You Can Do" (LYCD) method.
 
-    For each model group this method:
+    For each local area this method:
 
     1. Takes the median market value and median lot size of **improved**
-       (non-vacant) properties in the group.
+       (non-vacant) properties in the area.
     2. Derives a uniform local land rate::
 
            local_land_rate = (median_market_value * land_alloc_pct) / median_lot_size
 
-    3. Applies that rate to every parcel::
+    3. Applies that rate to every parcel in the area::
 
            land_value = local_land_rate * parcel_lot_size
 
@@ -815,12 +817,18 @@ def calc_lycd_land_values(
     side-by-side disparities that arise from naively multiplying each parcel's
     market value by a fixed allocation fraction.
 
-    When ``land_alloc`` is ``None`` the allocation for each group is derived
-    automatically from the data.  For each group the median per-unit value of
-    **vacant** properties is divided by the median per-unit value of **improved**
-    properties; that ratio is the implied land allocation.  If a group has no
-    vacant properties the global ratio (across all groups combined) is used as a
-    fallback.
+    When ``subarea_field`` is provided, rates are computed per
+    ``(subarea, model_group)`` cell rather than per model group alone.  This
+    captures the sharp neighbourhood-to-neighbourhood price jumps that a single
+    county-wide rate per land-use type would miss.  Cells with fewer than
+    ``min_improved_per_cell`` improved parcels fall back first to the
+    model-group rate, then to the global rate.
+
+    When ``land_alloc`` is ``None`` the allocation for each cell is derived
+    automatically from the data.  The median per-unit value of **vacant**
+    properties is divided by the median per-unit value of **improved**
+    properties; that ratio is the implied land allocation.  Fallback order:
+    cell → model group → global.
 
     Parameters
     ----------
@@ -832,28 +840,41 @@ def calc_lycd_land_values(
         - ``market_value_field`` (default ``"model_market_value"``)
         - ``"is_vacant"`` (bool)
         - ``f"land_area_{unit}"`` where *unit* is ``"sqft"`` or ``"sqm"``
+        - ``subarea_field`` column, if provided
 
     settings : dict
         Settings dictionary (used to determine the area unit).
     land_alloc : float, dict, or None
         Fraction of market value attributable to land.
 
-        - **float** – applied uniformly to all model groups (e.g. ``0.20`` for 20 %).
-        - **dict** – ``{model_group_id: float}`` for per-group allocations.
+        - **float** – applied uniformly to all cells.
+        - **dict** – keyed by ``model_group`` (``{mg: float}``) for per-group
+          allocations; individual cells inherit their group's allocation.
         - **None** – derived automatically from the ratio of vacant-to-improved
-          per-unit market values within each group.
+          per-unit market values, at the finest available level.
 
     market_value_field : str
         Column name holding the market value estimate.  Defaults to
         ``"model_market_value"``.
+    subarea_field : str or None
+        Column name whose values define geographic subareas (e.g.
+        ``"neighborhood"``, ``"census_tract"``).  When supplied, one local land
+        rate is derived per ``(subarea, model_group)`` cell.  When ``None``
+        (default), a single rate is derived per ``model_group``, which is
+        equivalent to treating the entire county as one area per land-use type.
+    min_improved_per_cell : int
+        Minimum number of improved parcels required in a
+        ``(subarea, model_group)`` cell before that cell gets its own rate.
+        Cells below this threshold fall back to the model-group rate.
+        Ignored when ``subarea_field`` is ``None``.  Default: 10.
 
     Returns
     -------
     pd.DataFrame
         Copy of *df* with three additional columns:
 
-        - ``"lycd_land_alloc"`` – land allocation fraction used for the group.
-        - ``"lycd_local_land_rate"`` – implied per-area-unit land rate for the group.
+        - ``"lycd_land_alloc"`` – land allocation fraction used for the cell.
+        - ``"lycd_local_land_rate"`` – implied per-area-unit land rate for the cell.
         - ``"lycd_land_value"`` – resulting land value (clamped to ``[0, market_value]``).
     """
     unit = area_unit(settings)
@@ -862,39 +883,95 @@ def calc_lycd_land_values(
     df = df.copy()
     model_groups = df["model_group"].unique()
 
-    # Resolve land_alloc to a per-group dict
+    # ------------------------------------------------------------------
+    # Resolve land_alloc to a per-model-group dict (used as fallback even
+    # when subarea_field is set, and as the primary source when it is not).
+    # ------------------------------------------------------------------
     if isinstance(land_alloc, dict):
         alloc_by_group = {mg: land_alloc.get(mg, np.nan) for mg in model_groups}
     elif land_alloc is not None:
         alloc_by_group = {mg: float(land_alloc) for mg in model_groups}
     else:
-        alloc_by_group = _derive_lycd_alloc_from_data(df, lot_area_field, market_value_field)
+        alloc_by_group = _derive_lycd_alloc_from_data(
+            df, lot_area_field, market_value_field, group_field="model_group"
+        )
 
-    local_land_rates = {}
-    resolved_allocs = {}
+    # ------------------------------------------------------------------
+    # Helper: compute one local land rate for a slice of the dataframe.
+    # Returns np.nan if the slice is too small or the medians are invalid.
+    # ------------------------------------------------------------------
+    def _rate_for_mask(mask_improved, alloc):
+        sub = df[mask_improved]
+        if len(sub) == 0 or np.isnan(alloc):
+            return np.nan
+        median_mv = sub[market_value_field].median()
+        median_lot = sub[lot_area_field].median()
+        if median_lot <= 0 or np.isnan(median_lot) or np.isnan(median_mv):
+            return np.nan
+        return (median_mv * alloc) / median_lot
 
+    # ------------------------------------------------------------------
+    # Build per-model-group fallback rates (always needed).
+    # ------------------------------------------------------------------
+    group_rates = {}
     for mg in model_groups:
-        mask_improved = df["model_group"].eq(mg) & df["is_vacant"].eq(False)
-        df_improved = df[mask_improved]
-
-        alloc = alloc_by_group.get(mg, np.nan)
-        resolved_allocs[mg] = alloc
-
-        if len(df_improved) == 0 or np.isnan(alloc):
-            local_land_rates[mg] = np.nan
-            continue
-
-        median_mv = df_improved[market_value_field].median()
-        median_lot = df_improved[lot_area_field].median()
-
-        if median_lot <= 0 or np.isnan(median_lot) or np.isnan(median_mv):
-            local_land_rates[mg] = np.nan
-            continue
+        mask = df["model_group"].eq(mg) & df["is_vacant"].eq(False)
+        group_rates[mg] = _rate_for_mask(mask, alloc_by_group.get(mg, np.nan))
+
+    # ------------------------------------------------------------------
+    # Assign rates and allocs row-by-row into output columns.
+    # ------------------------------------------------------------------
+    df["lycd_land_alloc"] = np.nan
+    df["lycd_local_land_rate"] = np.nan
+
+    if subarea_field is None:
+        # No geographic subdivision: one rate per model group.
+        for mg in model_groups:
+            idx = df["model_group"].eq(mg)
+            df.loc[idx, "lycd_local_land_rate"] = group_rates[mg]
+            df.loc[idx, "lycd_land_alloc"] = alloc_by_group.get(mg, np.nan)
+    else:
+        # Geographic subdivision: one rate per (subarea, model_group) cell,
+        # with fallback to model-group rate for thin cells.
+        subareas = df[subarea_field].unique()
+
+        # Pre-derive per-cell allocations when land_alloc is None.
+        if land_alloc is None:
+            cell_allocs = _derive_lycd_alloc_from_data(
+                df,
+                lot_area_field,
+                market_value_field,
+                group_field="model_group",
+                subarea_field=subarea_field,
+                alloc_by_group_fallback=alloc_by_group,
+            )
+        else:
+            cell_allocs = {}  # will use alloc_by_group for every cell
+
+        for mg in model_groups:
+            mask_mg = df["model_group"].eq(mg)
+            for sa in subareas:
+                mask_sa = df[subarea_field].eq(sa)
+                mask_cell_improved = mask_mg & mask_sa & df["is_vacant"].eq(False)
+                n_improved = mask_cell_improved.sum()
+                idx = mask_mg & mask_sa
+
+                if n_improved >= min_improved_per_cell:
+                    cell_key = (sa, mg)
+                    alloc = cell_allocs.get(cell_key, alloc_by_group.get(mg, np.nan))
+                    rate = _rate_for_mask(mask_cell_improved, alloc)
+                    if np.isnan(rate):
+                        # Cell medians invalid despite enough rows; fall back.
+                        rate = group_rates[mg]
+                        alloc = alloc_by_group.get(mg, np.nan)
+                else:
+                    # Too few improved parcels in this cell; use group rate.
+                    rate = group_rates[mg]
+                    alloc = alloc_by_group.get(mg, np.nan)
 
-        local_land_rates[mg] = (median_mv * alloc) / median_lot
+                df.loc[idx, "lycd_local_land_rate"] = rate
+                df.loc[idx, "lycd_land_alloc"] = alloc
 
-    df["lycd_land_alloc"] = df["model_group"].map(resolved_allocs)
-    df["lycd_local_land_rate"] = df["model_group"].map(local_land_rates)
     df["lycd_land_value"] = df["lycd_local_land_rate"] * df[lot_area_field]
 
     # Clamp: must be >= 0 and <= market value
@@ -909,16 +986,16 @@ def _derive_lycd_alloc_from_data(
     df: pd.DataFrame,
     lot_area_field: str,
     market_value_field: str,
+    group_field: str = "model_group",
+    subarea_field: "str | None" = None,
+    alloc_by_group_fallback: "dict | None" = None,
 ) -> dict:
-    """Derive per-group land allocation fractions from vacant vs improved values.
-
-    For each model group the allocation is:
-
-        median(market_value / lot_area) for vacant parcels
-        ─────────────────────────────────────────────────
-        median(market_value / lot_area) for improved parcels
+    """Derive land allocation fractions from vacant vs improved per-unit values.
 
-    If a group has no vacant parcels the global ratio is used as a fallback.
+    When ``subarea_field`` is None, returns ``{model_group: alloc}``.
+    When ``subarea_field`` is provided, returns ``{(subarea, model_group): alloc}``,
+    falling back to the group-level allocation for cells with no vacant parcels,
+    and to the global allocation if the group also has none.
     """
     def _per_unit_median(mask):
         sub = df[mask].copy()
@@ -929,20 +1006,42 @@ def _per_unit_median(mask):
     # Global fallback
     global_vacant_rate = _per_unit_median(df["is_vacant"].eq(True))
     global_improved_rate = _per_unit_median(df["is_vacant"].eq(False))
-    if global_improved_rate > 0 and not np.isnan(global_vacant_rate):
+    if global_improved_rate > 0 and not (
+        np.isnan(global_vacant_rate) or np.isnan(global_improved_rate)
+    ):
         global_alloc = global_vacant_rate / global_improved_rate
     else:
         global_alloc = np.nan
 
-    result = {}
-    for mg in df["model_group"].unique():
-        mask_mg = df["model_group"].eq(mg)
-        vacant_rate = _per_unit_median(mask_mg & df["is_vacant"].eq(True))
-        improved_rate = _per_unit_median(mask_mg & df["is_vacant"].eq(False))
-
-        if improved_rate > 0 and not np.isnan(vacant_rate):
-            result[mg] = vacant_rate / improved_rate
+    # Per-group allocations (always computed; used as fallback for cells).
+    group_allocs = {}
+    for mg in df[group_field].unique():
+        mask_mg = df[group_field].eq(mg)
+        vr = _per_unit_median(mask_mg & df["is_vacant"].eq(True))
+        ir = _per_unit_median(mask_mg & df["is_vacant"].eq(False))
+        if ir > 0 and not (np.isnan(vr) or np.isnan(ir)):
+            group_allocs[mg] = vr / ir
         else:
-            result[mg] = global_alloc
+            group_allocs[mg] = (
+                alloc_by_group_fallback.get(mg, global_alloc)
+                if alloc_by_group_fallback
+                else global_alloc
+            )
+
+    if subarea_field is None:
+        return group_allocs
+
+    # Per-(subarea, group) allocations with group → global fallback.
+    result = {}
+    for mg in df[group_field].unique():
+        mask_mg = df[group_field].eq(mg)
+        for sa in df[subarea_field].unique():
+            mask_sa = df[subarea_field].eq(sa)
+            vr = _per_unit_median(mask_mg & mask_sa & df["is_vacant"].eq(True))
+            ir = _per_unit_median(mask_mg & mask_sa & df["is_vacant"].eq(False))
+            if ir > 0 and not (np.isnan(vr) or np.isnan(ir)):
+                result[(sa, mg)] = vr / ir
+            else:
+                result[(sa, mg)] = group_allocs.get(mg, global_alloc)
 
     return result
\ No newline at end of file

From 2e31553d24d112266fc55bf8f7c58cf54b0abfca Mon Sep 17 00:00:00 2001
From: Russell Richie <drussellmrichie@github.com>
Date: Mon, 13 Apr 2026 10:32:16 -0400
Subject: [PATCH 5/5] fix: accumulated compatibility patches for
 pyarrow/pandas/scipy compat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Split-checkpoint contributions strategy (benchmark, pipeline, modeling)
- Thread `do_contributions` flag through the full call chain: `run_models`,
  `_run_models`, `run_one_model`, `run_one_hedonic_model`, `_predict_one_model`,
  `_run_hedonic_models`, `_write_model_results`, `write_model_parameters`
- Add `_DS` and `_SMRContribContext` helper classes (module-level, picklable)
- `_write_model_results`: save `_smr_for_contribs.pkl` + `_model_features.json`
  when `do_contributions=False`; enables deferred SHAP pass checkpointing
- New `compute_model_contributions(settings)` in pipeline.py: loads saved
  `_smr_for_contribs.pkl` files per model group, runs contributions pass,
  deletes pkl; allows crash recovery without retraining
- `finalize_models`: expose `run_main/vacant/hedonic/ensemble`, `do_shaps`,
  `do_plots`, `do_contributions` as real parameters (were hardcoded True/False)
- `_run_models`: add per-group `ind_vars` override from
  `settings.modeling.models.<mvh>.group_overrides.<group>.ind_vars`

## Bug fixes
- `data.py`: `from scipy.spatial import cKDTree` (private submodule removed
  in scipy >= 1.14)
- `data.py`: move `astype(int)` cast to after `.loc` override in
  `_basic_geo_enrichment` — pandas 2.x rejects float→int64 via `.loc`
- `sales_scrutiny_study.py`: go through `.astype(object)` before `.astype(str)`
  to avoid ArrowNotImplementedError on null-dtype columns (pyarrow >= 22)
- `shap_analysis.py`: warn + filter instead of raising ValueError when
  `list_vars` contains features dropped by variable selection
- `tuning.py`: guard `_lightgbm_rolling_origin_cv` against n_samples < n_splits
- `utilities/cache.py`: replace `col.values == ...` with `col.eq(...)` and
  wrap `.sum()` in `int()` — ArrowExtensionArray has no `.values` sum
- `utilities/stats.py`: multiple `calc_correlations` guards for sparse/small
  groups — empty naive_corr early-exit, all-NA score break, `len(remaining)
  <= 1` early return with correct schema, `.columns.tolist()[0]` for
  Arrow-backed Index compat

## Misc
- `.gitignore`: add `.DS_Store` and `.pytest_cache/`

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .gitignore                         |   6 ++
 openavmkit/benchmark.py            | 124 +++++++++++++++++++++++++----
 openavmkit/data.py                 |   9 ++-
 openavmkit/modeling.py             |   7 +-
 openavmkit/pipeline.py             |  95 ++++++++++++++++++++--
 openavmkit/sales_scrutiny_study.py |   4 +-
 openavmkit/shap_analysis.py        |   7 +-
 openavmkit/tuning.py               |   2 +
 openavmkit/utilities/cache.py      |   6 +-
 openavmkit/utilities/stats.py      |  16 +++-
 10 files changed, 238 insertions(+), 38 deletions(-)

diff --git a/.gitignore b/.gitignore
index 80dc3a63..7f4190c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,9 @@ tests/cache/
 cache/
 
 out/
+
+# OS
+.DS_Store
+
+# Testing
+.pytest_cache/
diff --git a/openavmkit/benchmark.py b/openavmkit/benchmark.py
index 6185707c..94cc9052 100644
--- a/openavmkit/benchmark.py
+++ b/openavmkit/benchmark.py
@@ -840,7 +840,8 @@ def run_models(
     run_hedonic: bool = True,
     run_ensemble: bool = True,
     do_shaps: bool = False,
-    do_plots: bool = False
+    do_plots: bool = False,
+    do_contributions: bool = True,
 ):
     """
     Runs predictive models on the given SalesUniversePair.
@@ -945,7 +946,8 @@ def run_models(
                 verbose,
                 run_ensemble,
                 do_shaps=do_shaps,
-                do_plots=do_plots
+                do_plots=do_plots,
+                do_contributions=do_contributions,
             )
             if mg_results is not None and save_results:
                 dict_all_results[model_group] = mg_results
@@ -1199,6 +1201,7 @@ def run_one_model(
     hedonic: bool = False,
     test_keys: list[str] | None = None,
     train_keys: list[str] | None = None,
+    do_contributions: bool = True,
 ) -> SingleModelResults | None:
     """
     Run a single model based on provided parameters and return its results.
@@ -1406,7 +1409,7 @@ def run_one_model(
         t.start("write")
         main_vacant_hedonic = "hedonic" if hedonic else "vacant" if vacant_only else "main"
         location = get_model_location(settings, main_vacant_hedonic, model_name)
-        _write_model_results(results, outpath, settings, location, verbose=verbose)
+        _write_model_results(results, outpath, settings, location, verbose=verbose, do_contributions=do_contributions)
         t.stop("write")
 
     return results
@@ -1428,6 +1431,7 @@ def run_one_hedonic_model(
     hedonic_test_against_vacant_sales: bool = True,
     save_results: bool = False,
     verbose: bool = False,
+    do_contributions: bool = True,
 ):
     """Run a single hedonic model based on provided parameters and return its results.
 
@@ -1513,6 +1517,7 @@ def run_one_hedonic_model(
         settings=settings,
         save_results=save_results,
         verbose=verbose,
+        do_contributions=do_contributions,
     )
     return results
 
@@ -1766,6 +1771,7 @@ def _predict_one_model(
     settings: dict,
     save_results: bool = False,
     verbose: bool = False,
+    do_contributions: bool = True,
 ) -> SingleModelResults:
     """
     Predict results for one model, using saved results if available.
@@ -1834,15 +1840,15 @@ def _predict_one_model(
         results = predict_slice(ds, slice_model, timing, verbose)
     
     if save_results:
-        
+
         mvh = settings.get("modeling", {}).get("models", {}).get(main_vacant_hedonic, {})
         model_entry = mvh.get("model_name", mvh.get("default", {}))
         location = model_entry.get("location", None)
         if location is None:
             location = get_important_field(settings, "loc_neighborhood")
-        
+
         location = get_model_location(settings, main_vacant_hedonic, model_name)
-        _write_model_results(results, outpath, settings, location, verbose=verbose)
+        _write_model_results(results, outpath, settings, location, verbose=verbose, do_contributions=do_contributions)
 
     return results
 
@@ -2130,13 +2136,61 @@ def _assemble_model_results(results: SingleModelResults, settings: dict):
     return dfs
 
 
-def _write_model_results(results: SingleModelResults, outpath: str, settings: dict, location: str = None, verbose:bool = False):
+class _DS:
+    """Minimal stand-in for DataSplit, used only to carry ind_vars and X matrices for the deferred contributions pass."""
+    def __init__(self, ind_vars, X_test, X_sales, X_univ):
+        self.ind_vars = ind_vars
+        self.X_test = X_test
+        self.X_sales = X_sales
+        self.X_univ = X_univ
+
+
+class _SMRContribContext:
+    """
+    Minimal context needed by write_model_parameters / write_shaps for the
+    contributions-only pass.  Holds the trained model, ind_vars, X matrices,
+    and the four DataFrames (geometry stripped to reduce pickle size).
+    """
+    def __init__(self, model, ind_vars, df_train, df_test, df_sales, df_universe):
+        self.model = model
+        # Subset to ind_vars only — avoid storing the full DataFrames twice
+        safe_ind_vars = [v for v in ind_vars if v in df_test.columns]
+        self.ds = _DS(
+            ind_vars=ind_vars,
+            X_test=df_test[safe_ind_vars],
+            X_sales=df_sales[safe_ind_vars],
+            X_univ=df_universe[safe_ind_vars],
+        )
+        self.df_train = df_train
+        self.df_test = df_test
+        self.df_sales = df_sales
+        self.df_universe = df_universe
+
+
+def _write_model_results(
+    results: SingleModelResults,
+    outpath: str,
+    settings: dict,
+    location: str = None,
+    verbose: bool = False,
+    do_contributions: bool = True,
+):
     """
     Write model results to disk in parquet and CSV formats.
+
+    Parameters
+    ----------
+    do_contributions : bool, optional
+        If True (default), compute and write SHAP contribution CSVs immediately.
+        If False, skip contributions and instead save a ``_smr_for_contribs.pkl``
+        file alongside the predictions so that
+        :func:`openavmkit.pipeline.compute_model_contributions` can run the
+        contributions pass separately (allowing it to be checkpointed
+        independently of model training).
     """
-    
+
     print(f"Write model results to {outpath}")
-    
+
     dfs = _assemble_model_results(results, settings)
     path = f"{outpath}/{results.model_name}"
     if "*" in path:
@@ -2144,11 +2198,11 @@ def _write_model_results(results: SingleModelResults, outpath: str, settings: di
     os.makedirs(path, exist_ok=True)
     for key in dfs:
         df = dfs[key]
-        
+
         if "geometry" in df.columns:
             df = gpd.GeoDataFrame(df, geometry="geometry", crs=getattr(df, "crs", None))
             df = ensure_geometries(df)
-        
+
         df.to_parquet(f"{path}/pred_{key}.parquet")
         if "geometry" in df:
             df = df.drop(columns=["geometry"])
@@ -2165,10 +2219,30 @@ def _write_model_results(results: SingleModelResults, outpath: str, settings: di
 
     with open(f"{path}/pred_universe.pkl", "wb") as f:
         pickle.dump(results.pred_univ, f, protocol=pickle.HIGHEST_PROTOCOL)
-    
+
     params_path = f"{path}"
-    
-    write_model_parameters(results.model, results, location, params_path, verbose=verbose)
+
+    if not do_contributions:
+        # Save the minimal context needed for the deferred contributions pass.
+        # Geometry columns are stripped to reduce pickle size; all other columns
+        # (needed by _contrib_to_unit_values) are preserved.
+        ctx = _SMRContribContext(
+            model=results.model,
+            ind_vars=results.ds.ind_vars,
+            df_train=results.df_train.drop(columns=["geometry"], errors="ignore"),
+            df_test=results.df_test.drop(columns=["geometry"], errors="ignore"),
+            df_sales=results.df_sales.drop(columns=["geometry"], errors="ignore"),
+            df_universe=results.df_universe.drop(columns=["geometry"], errors="ignore"),
+        )
+        smr_pkl_path = f"{path}/_smr_for_contribs.pkl"
+        with open(smr_pkl_path, "wb") as f:
+            pickle.dump(ctx, f, protocol=pickle.HIGHEST_PROTOCOL)
+        import json as _json
+        with open(f"{path}/_model_features.json", "w") as f:
+            _json.dump({"ind_vars": list(ctx.ds.ind_vars)}, f)
+        print(f"  Contributions deferred — saved context to {smr_pkl_path}")
+
+    write_model_parameters(results.model, results, location, params_path, verbose=verbose, do_contributions=do_contributions)
 
 
 def get_model_location(
@@ -3506,7 +3580,8 @@ def _run_hedonic_models(
     verbose: bool = False,
     save_results: bool = False,
     run_ensemble: bool = True,
-    do_plots: bool = False
+    do_plots: bool = False,
+    do_contributions: bool = True,
 ):
     """
     Run hedonic models and ensemble them, then update the benchmark.
@@ -4143,7 +4218,8 @@ def _run_models(
     verbose: bool = False,
     run_ensemble: bool = True,
     do_shaps: bool = False,
-    do_plots: bool = False
+    do_plots: bool = False,
+    do_contributions: bool = True,
 ):
     """
     Run models for a given model group and process ensemble results.
@@ -4200,6 +4276,18 @@ def _run_models(
 
     model_entries = settings_model.get("models").get(main_vacant_hedonic, {})
 
+    # Apply per-group ind_vars overrides from settings.modeling.models.<mvh>.group_overrides.<group>
+    _group_ind_vars = (
+        model_entries
+        .get("group_overrides", {})
+        .get(model_group, {})
+        .get("ind_vars", None)
+    )
+    if _group_ind_vars is not None:
+        import copy
+        model_entries = copy.deepcopy(model_entries)
+        model_entries.setdefault("default", {})["ind_vars"] = _group_ind_vars
+
     if models_to_run is None:
         models_to_run = list(model_entries.keys())
 
@@ -4299,6 +4387,7 @@ def _run_models(
             use_saved_params=use_saved_params,
             save_results=save_results,
             verbose=verbose,
+            do_contributions=do_contributions,
         )
         if results is not None:
             model_results[model_name] = results
@@ -4415,7 +4504,8 @@ def _run_models(
             verbose=verbose,
             save_results=save_results,
             run_ensemble=run_ensemble,
-            do_plots=do_plots
+            do_plots=do_plots,
+            do_contributions=do_contributions,
         )
         t.stop("run hedonic models")
 
diff --git a/openavmkit/data.py b/openavmkit/data.py
index e8e601b3..1d8296e7 100644
--- a/openavmkit/data.py
+++ b/openavmkit/data.py
@@ -27,7 +27,7 @@
 import pyarrow.parquet as pq
 import geopandas as gpd
 
-from scipy.spatial._ckdtree import cKDTree
+from scipy.spatial import cKDTree
 from shapely.geometry import Polygon
 from shapely.geometry import LineString
 from shapely.ops import unary_union
@@ -2866,13 +2866,14 @@ def _basic_geo_enrichment(
             gdf[f"land_area_{unit}"] = gdf[f"land_area_{unit}"].combine_first(
                 gdf[f"land_area_gis_{unit}"]
             )
-            gdf[f"land_area_{unit}"] = np.round(
-                gdf[f"land_area_{unit}"].combine_first(gdf[f"land_area_gis_{unit}"])
-            ).astype(int)
+            gdf[f"land_area_{unit}"] = gdf[f"land_area_{unit}"].combine_first(
+                gdf[f"land_area_gis_{unit}"]
+            )
             gdf.loc[
                 gdf[f"land_area_given_{unit}"].le(0) | gdf[f"land_area_given_{unit}"].isna(),
                 f"land_area_{unit}",
             ] = gdf[f"land_area_gis_{unit}"]
+            gdf[f"land_area_{unit}"] = np.round(gdf[f"land_area_{unit}"]).astype(int)
 
             # Calculate difference
             gdf[f"land_area_gis_delta_{unit}"] = gdf[f"land_area_gis_{unit}"] - gdf[f"land_area_{unit}"]
diff --git a/openavmkit/modeling.py b/openavmkit/modeling.py
index 9acd5291..f1c70123 100644
--- a/openavmkit/modeling.py
+++ b/openavmkit/modeling.py
@@ -6286,9 +6286,12 @@ def write_model_parameters(
     location: str,
     outpath: str,
     do_plot: bool = False,
-    verbose: bool = False
+    verbose: bool = False,
+    do_contributions: bool = True,
 ):
-    
+    if not do_contributions:
+        return
+
     print(f"write model parameters to {outpath}")
     xs = {
         "test": smr.ds.X_test,
diff --git a/openavmkit/pipeline.py b/openavmkit/pipeline.py
index a179f8eb..be854a74 100644
--- a/openavmkit/pipeline.py
+++ b/openavmkit/pipeline.py
@@ -1656,6 +1656,13 @@ def finalize_models(
     save_params: bool = True,
     use_saved_params: bool = True,
     verbose: bool = False,
+    run_main: bool = True,
+    run_vacant: bool = True,
+    run_hedonic: bool = True,
+    run_ensemble: bool = True,
+    do_shaps: bool = False,
+    do_plots: bool = False,
+    do_contributions: bool = True,
 ) -> None:
     """
     Tries out predictive models on the given SalesUniversePair, finalizes results and writes to disk.
@@ -1680,6 +1687,18 @@ def finalize_models(
         Whether to use saved model parameters.
     verbose : bool, optional
         If True, prints additional information.
+    run_main : bool, optional
+        Whether to run main (improved) models.
+    run_vacant : bool, optional
+        Whether to run vacant land models.
+    run_hedonic : bool, optional
+        Whether to run hedonic (land/improvement split) models.
+    run_ensemble : bool, optional
+        Whether to run ensemble models.
+    do_shaps : bool, optional
+        Whether to compute SHAP values.
+    do_plots : bool, optional
+        Whether to generate plots.
 
     Returns
     -------
@@ -1694,15 +1713,79 @@ def finalize_models(
         use_saved_params,
         save_results=True,
         verbose=verbose,
-        run_main=True,
-        run_vacant=True,
-        run_hedonic=True,
-        run_ensemble=True,
-        do_shaps=False,
-        do_plots=False
+        run_main=run_main,
+        run_vacant=run_vacant,
+        run_hedonic=run_hedonic,
+        run_ensemble=run_ensemble,
+        do_shaps=do_shaps,
+        do_plots=do_plots,
+        do_contributions=do_contributions,
     )
 
 
+def compute_model_contributions(settings: dict, verbose: bool = False) -> None:
+    """
+    Compute SHAP contribution CSVs from saved model artifacts.
+
+    This is the second half of a split checkpoint strategy.  Call
+    :func:`finalize_models` with ``do_contributions=False`` first (wrapped in
+    a ``3-model-02a-train-predict`` checkpoint) to train models and save
+    predictions.  Then call this function (wrapped in a
+    ``3-model-02b-contributions`` checkpoint) to compute contributions
+    separately, so that a crash during the slow SHAP pass does not require
+    retraining.
+
+    Looks for ``_smr_for_contribs.pkl`` files written by
+    :func:`_write_model_results` and calls
+    :func:`openavmkit.modeling.write_model_parameters` for each one, then
+    deletes the pickle.
+
+    Parameters
+    ----------
+    settings : dict
+        The settings dictionary (used to determine model location).
+    verbose : bool, optional
+        If True, prints additional information.
+    """
+    import glob
+    import pickle
+    import os
+
+    from openavmkit.modeling import write_model_parameters
+    from openavmkit.benchmark import get_model_location
+
+    pkl_files = sorted(glob.glob("out/models/*/*/*/_smr_for_contribs.pkl"))
+    if not pkl_files:
+        print("compute_model_contributions: no _smr_for_contribs.pkl files found — nothing to do.")
+        return
+
+    for pkl_path in pkl_files:
+        # path is e.g. out/models/residential_sf/main/lightgbm/_smr_for_contribs.pkl
+        parts = pkl_path.replace("\\", "/").split("/")
+        model_group = parts[-4]   # e.g. residential_sf
+        mvh = parts[-3]           # e.g. main / vacant
+        model_name = parts[-2]    # e.g. lightgbm
+        outpath = "/".join(parts[:-1])  # directory containing the pkl
+
+        print(f"  Computing contributions: {model_group}/{mvh}/{model_name}")
+
+        with open(pkl_path, "rb") as f:
+            ctx = pickle.load(f)
+
+        location = get_model_location(settings, mvh, model_name)
+        write_model_parameters(
+            ctx.model,
+            ctx,
+            location,
+            outpath,
+            verbose=verbose,
+            do_contributions=True,
+        )
+
+        os.remove(pkl_path)
+        print(f"  Done — removed {pkl_path}")
+
+
 def run_models(
     sup: SalesUniversePair,
     settings: dict,
diff --git a/openavmkit/sales_scrutiny_study.py b/openavmkit/sales_scrutiny_study.py
index d981bc71..49da1d1b 100644
--- a/openavmkit/sales_scrutiny_study.py
+++ b/openavmkit/sales_scrutiny_study.py
@@ -111,8 +111,8 @@ def __init__(self, df: pd.DataFrame, settings: dict, model_group: str):
         for key in stuff:
             df = stuff[key]
             df, cluster_fields = _mark_sales_scrutiny_clusters(df, settings)
-            df["ss_id"] = df["ss_id"].astype(str)
-            df["ss_id"] = df["model_group"] + "_" + key + "_" + df["ss_id"]
+            df["ss_id"] = df["ss_id"].astype(object).fillna("").astype(str)
+            df["ss_id"] = df["model_group"].astype(object).astype(str) + "_" + key + "_" + df["ss_id"]
             per_area = ""
             denominator = ""
             if key == "i":
diff --git a/openavmkit/shap_analysis.py b/openavmkit/shap_analysis.py
index ce4302ee..edc6baa1 100644
--- a/openavmkit/shap_analysis.py
+++ b/openavmkit/shap_analysis.py
@@ -213,7 +213,12 @@ def make_shap_table(
         existing = list(expl.feature_names)
         missing = [c for c in list_vars if c not in existing]
         if missing:
-            raise ValueError(f"These list_vars are missing from explanation features: {missing}")
+            import warnings
+            warnings.warn(
+                f"These list_vars are missing from explanation features "
+                f"(likely dropped by variable selection): {missing}"
+            )
+            list_vars = [c for c in list_vars if c in existing]
         feature_cols = list_vars  # enforce this order
 
     df_features = pd.DataFrame(vals, columns=expl.feature_names)
diff --git a/openavmkit/tuning.py b/openavmkit/tuning.py
index 1e7706ec..74ecb45c 100644
--- a/openavmkit/tuning.py
+++ b/openavmkit/tuning.py
@@ -485,6 +485,8 @@ def _catboost_rolling_origin_cv(
 
 
 def _lightgbm_rolling_origin_cv(X, y, params, n_splits=5, random_state=42, cat_vars=None):
+    if len(X) < n_splits:
+        return float("inf")
     kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
     mape_scores = []
 
diff --git a/openavmkit/utilities/cache.py b/openavmkit/utilities/cache.py
index d7301e26..43accfdf 100644
--- a/openavmkit/utilities/cache.py
+++ b/openavmkit/utilities/cache.py
@@ -270,11 +270,11 @@ def write_cached_df(
 
             is_different = False
             if len(col_new) == len(col_orig):
-                values_equal = col_new.values == col_orig.values
+                values_equal = col_new.eq(col_orig)
                 na_equal = col_new.isna() & col_orig.isna()
 
-                count_na_equal = na_equal.sum()
-                count_values_equal = values_equal.sum()
+                count_na_equal = int(na_equal.sum())
+                count_values_equal = int(values_equal.sum())
 
                 count_to_match = len(col_new)
 
diff --git a/openavmkit/utilities/stats.py b/openavmkit/utilities/stats.py
index 75303d3e..f76439ab 100644
--- a/openavmkit/utilities/stats.py
+++ b/openavmkit/utilities/stats.py
@@ -694,8 +694,11 @@ def calc_correlations(
     while True:
         naive_corr = corr_full.loc[remaining, remaining]
 
+        if naive_corr.empty or len(naive_corr.columns) == 0:
+            break
+
         # --- NEW: compute one explicit order and apply to both axes
-        order = _order_by_target_corr(naive_corr, target_col=naive_corr.columns[0])
+        order = _order_by_target_corr(naive_corr, target_col=naive_corr.columns.tolist()[0])
         naive_corr = naive_corr.loc[order, order]
         # -----------------------------------------------
 
@@ -713,6 +716,8 @@ def calc_correlations(
 
         score = strength * clarity * clarity
 
+        if score.isna().all():
+            break
         min_score_idx = score.idxmin()
         try:
             min_score = score[min_score_idx]
@@ -736,6 +741,11 @@ def calc_correlations(
     # drop all other variables that are more correlated with current_variable than
     # they are with the target.
     # ------------------------------------------------------------------
+    if len(remaining) <= 1:
+        # Only target variable (or nothing) remains — skip second pass
+        _empty = pd.DataFrame(columns=["variable", "corr_strength", "corr_clarity", "corr_score"])
+        _stub = first_run if first_run is not None else _empty
+        return {"initial": _stub, "final": _empty, "bad_vars": bad_vars}
     target_col = remaining[0]
 
     # process in descending strength-to-target order (excluding target itself)
@@ -777,7 +787,7 @@ def calc_correlations(
 
     # Recompute scores for the *final* remaining set (same math as above)
     naive_corr = corr_full.loc[remaining, remaining]
-    order = _order_by_target_corr(naive_corr, target_col=naive_corr.columns[0])
+    order = _order_by_target_corr(naive_corr, target_col=naive_corr.columns.tolist()[0])
     naive_corr = naive_corr.loc[order, order]
 
     naive_corr_sans_target = naive_corr.iloc[1:, 1:]
@@ -803,7 +813,7 @@ def calc_correlations(
     final_corr = corr_full.loc[remaining, remaining]
 
     # --- NEW: order final_corr the same way (by target corr within remaining set)
-    final_order = _order_by_target_corr(final_corr, target_col=final_corr.columns[0])
+    final_order = _order_by_target_corr(final_corr, target_col=final_corr.columns.tolist()[0])
     final_corr = final_corr.loc[final_order, final_order]
     # -----------------------------------------------