From fbcad5a08d0d8b0abc901f69fb7781d83a8576ed Mon Sep 17 00:00:00 2001
From: Russell Richie
Date: Tue, 14 Apr 2026 12:16:36 -0400
Subject: [PATCH] fix(tuning): cap LightGBM num_leaves/min_data_in_leaf search
 space for thin datasets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With small model groups (e.g. <200 training samples), the Optuna tuner
can select num_leaves values in the thousands: severe memorisation that
yields artificially low CV MAPE but collapses out-of-sample performance.
For example, with ~101 training samples the tuner found num_leaves=1514,
degrading ratio-study COD from ~40 to ~55.

Fix: before building the search space, compute n_train_per_fold ≈
n * (k-1)/k and cap num_leaves at max(8, n_train_per_fold // 4) and
min_data_in_leaf at max(2, n_train_per_fold // 4), never exceeding the
original upper bounds. For large datasets (n_train_per_fold >= 8192)
both caps coincide with the original upper bounds (2048 and 500) and
have no effect. For thin datasets they prevent the tuner from selecting
tree complexities that cannot generalise. A verbose warning is printed
when the cap takes effect.
---
 openavmkit/tuning.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/openavmkit/tuning.py b/openavmkit/tuning.py
index 1e7706ec..2aae2145 100644
--- a/openavmkit/tuning.py
+++ b/openavmkit/tuning.py
@@ -121,6 +121,20 @@ def _tune_lightgbm(
         dict: Best hyperparameters found by Optuna.
     """
 
+    # Bound search space by training-fold size to prevent memorisation on thin datasets.
+    # Each CV fold trains on roughly (n_splits - 1) / n_splits of the data.
+    n_train_per_fold = int(len(X) * (n_splits - 1) / n_splits)
+    # num_leaves: cap at n_train_per_fold // 4 so each leaf covers ~4+ samples on average.
+    # This prevents the tuner from selecting thousands of leaves from a few hundred rows.
+    max_num_leaves = max(8, min(2048, n_train_per_fold // 4))
+    # min_data_in_leaf: the upper bound must not exceed the training fold size, or every split is illegal.
+    max_min_data_in_leaf = max(2, min(500, n_train_per_fold // 4))
+    if verbose and max_num_leaves < 64:
+        print(
+            f" [tune_lightgbm] thin dataset (n_train_per_fold={n_train_per_fold}): "
+            f"num_leaves capped at {max_num_leaves}, min_data_in_leaf capped at {max_min_data_in_leaf}"
+        )
+
     def objective(trial):
         """Objective function for Optuna to optimize LightGBM hyperparameters."""
         params = {
@@ -132,12 +146,12 @@ def objective(trial):
                 "learning_rate", 0.0001, 0.1, log=True
             ),
             "max_bin": trial.suggest_int("max_bin", 64, 1024),
-            "num_leaves": trial.suggest_int("num_leaves", 64, 2048),
+            "num_leaves": trial.suggest_int("num_leaves", min(64, max_num_leaves), max_num_leaves),
             "max_depth": trial.suggest_int("max_depth", 5, 15),
             "min_gain_to_split": trial.suggest_float(
                 "min_gain_to_split", 1e-4, 50, log=True
             ),
-            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 500),
+            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", min(20, max_min_data_in_leaf), max_min_data_in_leaf),
             "feature_fraction": trial.suggest_float(
                 "feature_fraction", 0.4, 0.9, log=False
             ),
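
--
Reviewer note: the snippet below is a minimal standalone sketch of the
cap arithmetic in the hunk above, not part of the committed patch. It
assumes 5-fold CV (n_splits=5), and capped_bounds is a hypothetical
helper name introduced only for this illustration.

    def capped_bounds(n_rows: int, n_splits: int = 5) -> dict:
        """Reproduce the patch's search-space caps for a given dataset size."""
        # Each CV fold trains on roughly (n_splits - 1) / n_splits of the rows.
        n_train_per_fold = int(n_rows * (n_splits - 1) / n_splits)
        cap = n_train_per_fold // 4  # ~4+ samples per leaf on average
        max_num_leaves = max(8, min(2048, cap))
        max_min_data_in_leaf = max(2, min(500, cap))
        # (low, high) bounds as they would be passed to trial.suggest_int.
        return {
            "num_leaves": (min(64, max_num_leaves), max_num_leaves),
            "min_data_in_leaf": (min(20, max_min_data_in_leaf), max_min_data_in_leaf),
        }

    # Thin dataset from the commit message: both ranges collapse to a point.
    print(capped_bounds(101))    # {'num_leaves': (20, 20), 'min_data_in_leaf': (20, 20)}
    # Large dataset: caps coincide with the original bounds, so no effect.
    print(capped_bounds(20000))  # {'num_leaves': (64, 2048), 'min_data_in_leaf': (20, 500)}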