From fbcad5a08d0d8b0abc901f69fb7781d83a8576ed Mon Sep 17 00:00:00 2001
From: Russell Richie
Date: Tue, 14 Apr 2026 12:16:36 -0400
Subject: [PATCH] fix(tuning): cap LightGBM num_leaves/min_data_in_leaf search
 space for thin datasets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With small model groups (e.g. <200 training samples), the Optuna tuner
can select num_leaves values in the thousands: severe memorisation that
yields artificially low CV MAPE but collapses out-of-sample performance.
For example, with ~101 training samples the tuner found num_leaves=1514,
degrading ratio-study COD from ~40 to ~55.

Fix: before building the search space, compute n_train_per_fold ≈
n * (k-1)/k and cap num_leaves at max(8, n_train_per_fold // 4) and
min_data_in_leaf at max(2, n_train_per_fold // 4), never exceeding the
original upper bounds. For large datasets (n_train_per_fold >= 8192)
both caps coincide with the original upper bounds (2048 and 500) and
have no effect. For thin datasets they prevent the tuner from selecting
tree complexities that cannot generalise. A verbose warning is printed
when the cap takes effect.
---
 openavmkit/tuning.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/openavmkit/tuning.py b/openavmkit/tuning.py
index 1e7706ec..2aae2145 100644
--- a/openavmkit/tuning.py
+++ b/openavmkit/tuning.py
@@ -121,6 +121,20 @@ def _tune_lightgbm(
         dict: Best hyperparameters found by Optuna.
     """
 
+    # Bound search space by training-fold size to prevent memorisation on thin datasets.
+    # Each CV fold trains on roughly (n_splits - 1) / n_splits of the data.
+    n_train_per_fold = int(len(X) * (n_splits - 1) / n_splits)
+    # num_leaves: cap at n_train_per_fold // 4 so each leaf covers ~4+ samples on average.
+    # This prevents the tuner from selecting thousands of leaves from a few hundred rows.
+    max_num_leaves = max(8, min(2048, n_train_per_fold // 4))
+    # min_data_in_leaf: the upper bound must not exceed the training fold size, or every split is illegal.
+    max_min_data_in_leaf = max(2, min(500, n_train_per_fold // 4))
+    if verbose and max_num_leaves < 64:
+        print(
+            f" [tune_lightgbm] thin dataset (n_train_per_fold={n_train_per_fold}): "
+            f"num_leaves capped at {max_num_leaves}, min_data_in_leaf capped at {max_min_data_in_leaf}"
+        )
+
     def objective(trial):
         """Objective function for Optuna to optimize LightGBM hyperparameters."""
         params = {
@@ -132,12 +146,12 @@ def objective(trial):
                 "learning_rate", 0.0001, 0.1, log=True
             ),
             "max_bin": trial.suggest_int("max_bin", 64, 1024),
-            "num_leaves": trial.suggest_int("num_leaves", 64, 2048),
+            "num_leaves": trial.suggest_int("num_leaves", min(64, max_num_leaves), max_num_leaves),
             "max_depth": trial.suggest_int("max_depth", 5, 15),
             "min_gain_to_split": trial.suggest_float(
                 "min_gain_to_split", 1e-4, 50, log=True
             ),
-            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 500),
+            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", min(20, max_min_data_in_leaf), max_min_data_in_leaf),
             "feature_fraction": trial.suggest_float(
                 "feature_fraction", 0.4, 0.9, log=False
             ),
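
--
Reviewer note: the snippet below is a minimal standalone sketch of the
cap arithmetic in the hunk above, not part of the committed patch. It
assumes 5-fold CV (n_splits=5), and capped_bounds is a hypothetical
helper name introduced only for this illustration.

    def capped_bounds(n_rows: int, n_splits: int = 5) -> dict:
        """Reproduce the patch's search-space caps for a given dataset size."""
        # Each CV fold trains on roughly (n_splits - 1) / n_splits of the rows.
        n_train_per_fold = int(n_rows * (n_splits - 1) / n_splits)
        cap = n_train_per_fold // 4  # ~4+ samples per leaf on average
        max_num_leaves = max(8, min(2048, cap))
        max_min_data_in_leaf = max(2, min(500, cap))
        # (low, high) bounds as they would be passed to trial.suggest_int.
        return {
            "num_leaves": (min(64, max_num_leaves), max_num_leaves),
            "min_data_in_leaf": (min(20, max_min_data_in_leaf), max_min_data_in_leaf),
        }

    # Thin dataset from the commit message: both ranges collapse to a point.
    print(capped_bounds(101))    # {'num_leaves': (20, 20), 'min_data_in_leaf': (20, 20)}
    # Large dataset: caps coincide with the original bounds, so no effect.
    print(capped_bounds(20000))  # {'num_leaves': (64, 2048), 'min_data_in_leaf': (20, 500)}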