From a993ff3a315bcaa1baa38c37c4351ada1b3eccb3 Mon Sep 17 00:00:00 2001
From: Russell Richie <drussellmrichie@github.com>
Date: Fri, 27 Mar 2026 13:19:11 -0400
Subject: [PATCH] fix: guard against n_splits > n_samples in rolling-origin CV

When a model group has very few training samples (e.g. sparse commercial
or post-valuation sub-models), KFold raises ValueError if n_splits
exceeds the number of rows. In _lightgbm_rolling_origin_cv, cap n_splits
at len(X) and, when that leaves fewer than 2 splits (fewer than 2
samples, or a caller-supplied n_splits below 2), emit a UserWarning and
return a penalty MAPE of 1.0 so Optuna can still complete gracefully.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 openavmkit/tuning.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/openavmkit/tuning.py b/openavmkit/tuning.py
index 1e7706e..2c4c594 100644
--- a/openavmkit/tuning.py
+++ b/openavmkit/tuning.py
@@ -485,6 +485,16 @@ def _catboost_rolling_origin_cv(
 
 
 def _lightgbm_rolling_origin_cv(X, y, params, n_splits=5, random_state=42, cat_vars=None):
+    n_samples = len(X)
+    n_splits = min(n_splits, n_samples)
+    if n_splits < 2:
+        import warnings
+        warnings.warn(
+            f"Not enough samples ({n_samples}) for cross-validation with n_splits={n_splits}. "
+            "Returning penalty MAPE of 1.0.",
+            UserWarning,
+        )
+        return 1.0
     kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
     mape_scores = []