From 18abe59e92caabd112c8606fbf44a9d9da2800e6 Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 02:35:53 +0000 Subject: [PATCH 01/12] feat: add probability-aware metric support across search and evaluation --- Makefile | 27 ++++ plexe/CODE_INDEX.md | 12 +- plexe/agents/baseline_builder.py | 33 +++- plexe/agents/model_evaluator.py | 12 +- plexe/helpers.py | 59 +++++++- plexe/search/evolutionary_search_policy.py | 35 +++-- plexe/search/journal.py | 51 +++++-- plexe/search/tree_policy.py | 6 +- .../templates/inference/catboost_predictor.py | 33 ++++ plexe/templates/inference/keras_predictor.py | 71 +++++++-- .../templates/inference/lightgbm_predictor.py | 33 ++++ .../templates/inference/pytorch_predictor.py | 5 + .../templates/inference/xgboost_predictor.py | 33 ++++ plexe/tools/submission.py | 66 ++++++-- plexe/workflow.py | 10 +- tests/CODE_INDEX.md | 92 ++++++++++- .../agents/test_model_evaluator_prompt.py | 23 +++ .../test_evolutionary_policy_determinism.py | 12 ++ tests/unit/search/test_journal.py | 37 +++++ .../search/test_tree_policy_determinism.py | 14 ++ .../test_baseline_probability_validation.py | 63 ++++++++ tests/unit/test_catboost_predictor.py | 45 ++++++ tests/unit/test_helpers.py | 143 +++++++++++++++++- tests/unit/test_keras_predictor.py | 79 ++++++++++ tests/unit/test_lightgbm_predictor.py | 39 +++++ tests/unit/test_pytorch_predictor.py | 50 ++++++ tests/unit/test_xgboost_predictor.py | 60 ++++++++ 27 files changed, 1074 insertions(+), 69 deletions(-) create mode 100644 tests/unit/agents/test_model_evaluator_prompt.py create mode 100644 tests/unit/test_baseline_probability_validation.py create mode 100644 tests/unit/test_catboost_predictor.py create mode 100644 tests/unit/test_keras_predictor.py create mode 100644 tests/unit/test_pytorch_predictor.py create mode 100644 tests/unit/test_xgboost_predictor.py diff --git a/Makefile b/Makefile index 4557d82a..59731544 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,7 @@ help: @echo "" @echo "📊 Example Datasets:" @echo " make run-titanic Run on Titanic dataset (medium)" + @echo " make run-titanic-proba Run Titanic with probability-focused intent" @echo " make run-house-prices Run on House Prices dataset (regression)" @echo "" @echo "🏗️ Building:" @@ -331,6 +332,32 @@ run-titanic: build --spark-mode local \ --enable-final-evaluation +# Spaceship Titanic dataset with probability-focused objective +.PHONY: run-titanic-proba +run-titanic-proba: build + @echo "📊 Running on Spaceship Titanic dataset (probability-focused)..." + $(eval TIMESTAMP := $(shell date +%Y%m%d_%H%M%S)) + docker run --rm \ + --add-host=host.docker.internal:host-gateway \ + $(CONFIG_MOUNT) \ + $(CONFIG_ENV) \ + -v $(PWD)/examples/datasets:/data:ro \ + -v $(PWD)/workdir:/workdir \ + -e OPENAI_API_KEY=$(OPENAI_API_KEY) \ + -e ANTHROPIC_API_KEY=$(ANTHROPIC_API_KEY) \ + -e SPARK_LOCAL_CORES=4 \ + -e SPARK_DRIVER_MEMORY=4g \ + plexe:py$(PYTHON_VERSION) \ + python -m plexe.main \ + --train-dataset-uri /data/spaceship-titanic/train.parquet \ + --user-id dev_user \ + --intent "predict each passenger's probability of being transported; optimize probability quality and ranking" \ + --experiment-id titanic_proba \ + --max-iterations 5 \ + --work-dir /workdir/titanic_proba/$(TIMESTAMP) \ + --spark-mode local \ + --enable-final-evaluation + # House Prices dataset (regression) .PHONY: run-house-prices run-house-prices: build diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 2211af1d..19a31ee6 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 00:06:47 +> Generated on 2026-03-03 02:35:53 Code structure and public interface documentation for the **plexe** package. @@ -222,6 +222,8 @@ Helper functions for workflow. **Functions:** - `select_viable_model_types(data_layout: DataLayout, selected_frameworks: list[str] | None) -> list[str]` - Select viable model types using three-tier filtering. +- `metric_requires_probabilities(metric_name: str) -> bool` - Return True when a metric requires probability scores instead of hard labels. +- `normalize_probability_predictions(y_true: np.ndarray, y_pred_proba: Any, metric_name: str) -> np.ndarray` - Normalize probability predictions for sklearn metric compatibility. - `evaluate_on_sample(spark: SparkSession, sample_uri: str, model_artifacts_path: Path, model_type: str, metric: str, target_columns: list[str], group_column: str | None, train_sample_uri: str | None) -> tuple[float, float | None]` - Evaluate model on validation sample, optionally also on training sample. - `compute_metric_hardcoded(y_true, y_pred, metric_name: str) -> float` - Compute metric using hardcoded sklearn implementations. - `compute_metric(y_true, y_pred, metric_name: str, group_ids) -> float` - Compute metric value. @@ -393,7 +395,10 @@ Insight store for accumulating learnings from search. Search journal for tracking model search tree. **`SearchJournal`** - Tracks solution search tree. -- `__init__(self, baseline: Baseline | None)` +- `__init__(self, baseline: Baseline | None, optimization_direction: str)` +- `selection_score(self, value: float) -> float` - Normalize a metric value so larger always means better. +- `is_better(self, candidate: float, reference: float | None) -> bool` - Compare two metric values using the configured optimization direction. +- `sort_key(self, node: Solution) -> float` - Direction-aware sort key for solution nodes. - `add_node(self, node: Solution) -> None` - Add a solution to the journal. - `draft_nodes(self) -> list[Solution]` - Get all root nodes (bootstrap solutions without parents). - `buggy_nodes(self) -> list[Solution]` - Get all buggy nodes that could be debugged. @@ -447,6 +452,7 @@ Standard CatBoost predictor - NO Plexe dependencies. **`CatBoostPredictor`** - Standalone CatBoost predictor. - `__init__(self, model_dir: str)` - `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame. +- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame. --- ## `templates/inference/keras_predictor.py` @@ -464,6 +470,7 @@ Standard LightGBM predictor - NO Plexe dependencies. **`LightGBMPredictor`** - Standalone LightGBM predictor. - `__init__(self, model_dir: str)` - `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame. +- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame. --- ## `templates/inference/pytorch_predictor.py` @@ -481,6 +488,7 @@ Standard XGBoost predictor - NO Plexe dependencies. **`XGBoostPredictor`** - Standalone XGBoost predictor. - `__init__(self, model_dir: str)` - `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame. +- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame. --- ## `templates/packaging/model_card_template.py` diff --git a/plexe/agents/baseline_builder.py b/plexe/agents/baseline_builder.py index 580ccfcf..81163f6b 100644 --- a/plexe/agents/baseline_builder.py +++ b/plexe/agents/baseline_builder.py @@ -17,6 +17,7 @@ from plexe.config import Config, get_routing_for_model from plexe.constants import DirNames +from plexe.helpers import compute_metric, metric_requires_probabilities, normalize_probability_predictions from plexe.models import BuildContext, Baseline from plexe.utils.tracing import agent_span from plexe.tools.submission import ( @@ -54,6 +55,13 @@ def _build_agent(self, val_sample_df) -> CodeAgent: from plexe.agents.utils import format_user_feedback_for_prompt feedback_section = format_user_feedback_for_prompt(self.context.scratch.get("_user_feedback")) + requires_proba = metric_requires_probabilities(self.context.metric.name) + proba_requirement = ( + "6. Because the selected primary metric requires probabilities, your class MUST also implement\n" + " `predict_proba(self, x: pd.DataFrame) -> np.ndarray | pd.DataFrame` that returns per-sample scores.\n" + if requires_proba + else "" + ) # Get routing configuration for this agent's model api_base, headers = get_routing_for_model(self.config.routing_config, self.llm_model) @@ -124,6 +132,7 @@ def _build_agent(self, val_sample_df) -> CodeAgent: "3. Instantiate and validate: validate_baseline_predictor(predictor, name, description)\n" "4. Once the predictor is successfully validated, save its code as string: save_baseline_code(code_string)\n" "5. Call final_answer() with rationale\n" + f"{proba_requirement}" "\n" "## CRITICAL:\n" "- Use task_analysis['output_targets'] to identify target column(s)\n" @@ -164,6 +173,11 @@ def run(self) -> Baseline: task_type = self.context.task_analysis.get("task_type", "unknown") target_columns = self.context.output_targets metric_name = self.context.metric.name + proba_note = ( + "Primary metric requires probabilities, so baseline must implement predict_proba(X)." + if metric_requires_probabilities(metric_name) + else "Primary metric uses label/value predictions." + ) task = ( f"Create a simple baseline predictor for this ML task.\n\n" @@ -171,6 +185,7 @@ def run(self) -> Baseline: f"Task Type: {task_type}\n" f"Target Column(s): {target_columns}\n" f"Metric: {metric_name}\n\n" + f"{proba_note}\n\n" f"Build ONE simple baseline (heuristics preferred) that makes sense for this ML task. " f"Register it and evaluate performance using the tools provided." ) @@ -224,8 +239,6 @@ def _evaluate_performance(self, val_sample_df) -> float: Returns: Performance metric value """ - from plexe.helpers import compute_metric - # Separate features from target target_cols = self.context.output_targets feature_cols = [col for col in val_sample_df.columns if col not in target_cols] @@ -233,11 +246,21 @@ def _evaluate_performance(self, val_sample_df) -> float: X_val = val_sample_df[feature_cols] y_val = val_sample_df[target_cols[0]] - # Make predictions - y_pred = self.context.baseline_predictor.predict(X_val) + if metric_requires_probabilities(self.context.metric.name): + if not hasattr(self.context.baseline_predictor, "predict_proba") or not callable( + self.context.baseline_predictor.predict_proba + ): + raise ValueError( + f"Metric '{self.context.metric.name}' requires probabilities but baseline predictor " + "does not implement predict_proba()." + ) + raw_proba = self.context.baseline_predictor.predict_proba(X_val) + y_pred_input = normalize_probability_predictions(y_val.values, raw_proba, self.context.metric.name) + else: + y_pred_input = self.context.baseline_predictor.predict(X_val) # Compute metric - performance = compute_metric(y_true=y_val.values, y_pred=y_pred, metric_name=self.context.metric.name) + performance = compute_metric(y_true=y_val.values, y_pred=y_pred_input, metric_name=self.context.metric.name) logger.info(f"Baseline performance: {self.context.metric.name}={performance:.4f}") diff --git a/plexe/agents/model_evaluator.py b/plexe/agents/model_evaluator.py index 7c098e34..bca184b9 100644 --- a/plexe/agents/model_evaluator.py +++ b/plexe/agents/model_evaluator.py @@ -95,8 +95,11 @@ def _build_agent(self, phase_name: str, phase_prompt: str, tools: list) -> CodeA "- primary_metric_name: Name of the primary optimization metric (string)\\n" "- output_targets: list[str] (target column names to exclude from features)\\n\\n" "PREDICTOR INTERFACE:\\n" - "The predictor's predict() function takes a pandas DataFrame (features only, no target)\\n" - "and returns a pandas DataFrame with a 'prediction' column.\\n\\n" + "- predict(X): input is features-only pandas DataFrame (no target columns), returns DataFrame with 'prediction'\\n" + "- predict_proba(X): input is features-only pandas DataFrame; classification only; returns per-class probabilities\\n\\n" + "CRITICAL METRIC RULE:\\n" + "If a metric is probability-based (roc_auc, roc_auc_ovr, roc_auc_ovo, log_loss),\\n" + "you MUST compute it from predict_proba() outputs, not from thresholded labels.\\n\\n" "Example usage:\\n" "```python\\n" "# Prepare features (drop target columns using output_targets)\\n" @@ -106,6 +109,7 @@ def _build_agent(self, phase_name: str, phase_prompt: str, tools: list) -> CodeA "# Generate predictions (returns DataFrame with 'prediction' column)\\n" "predictions_df = predictor.predict(X_test)\\n" "y_pred = predictions_df['prediction'].values\\n" + "# For probability metrics, use predictor.predict_proba(X_test)\\n" "```\\n\\n" "## YOUR MISSION:\\n" f"{phase_prompt}\\n\\n" @@ -326,6 +330,10 @@ def _get_phase_1_prompt(task: str, primary_metric_name: str) -> str: f"3. Compute primary metric + 4-6 additional relevant metrics\\n" f"4. ENCOURAGED: Compute 95% confidence intervals using bootstrap (1000 samples)\\n" f"5. Interpret results - what do these numbers tell us?\\n\\n" + f"If computing probability-based metrics (roc_auc, roc_auc_ovr, roc_auc_ovo, log_loss):\\n" + f"- Use predictor.predict_proba(X_test), never thresholded class labels\\n" + f"- Binary: use positive-class scores\\n" + f"- Multiclass: use full per-class probability matrix\\n\\n" f"Register using:\\n" f"register_core_metrics_report(\\n" f" task_type='...', # your detected type\\n" diff --git a/plexe/helpers.py b/plexe/helpers.py index 2c9c203d..e348fce2 100644 --- a/plexe/helpers.py +++ b/plexe/helpers.py @@ -28,6 +28,13 @@ logger = logging.getLogger(__name__) +PROBABILITY_METRICS = { + StandardMetric.ROC_AUC.value, + StandardMetric.ROC_AUC_OVR.value, + StandardMetric.ROC_AUC_OVO.value, + StandardMetric.LOG_LOSS.value, +} + def select_viable_model_types(data_layout: DataLayout, selected_frameworks: list[str] | None = None) -> list[str]: """ @@ -85,6 +92,44 @@ def select_viable_model_types(data_layout: DataLayout, selected_frameworks: list return viable +def metric_requires_probabilities(metric_name: str) -> bool: + """Return True when a metric requires probability scores instead of hard labels.""" + return metric_name.lower().strip() in PROBABILITY_METRICS + + +def normalize_probability_predictions(y_true: np.ndarray, y_pred_proba: Any, metric_name: str) -> np.ndarray: + """ + Normalize probability predictions for sklearn metric compatibility. + + - Binary metrics use positive-class scores (1D array). + - Multiclass metrics use full per-class probability matrix (2D array). + """ + probabilities = y_pred_proba.values if hasattr(y_pred_proba, "values") else np.asarray(y_pred_proba) + metric = metric_name.lower().strip() + n_classes = len(np.unique(y_true)) + + if probabilities.ndim == 1: + if n_classes > 2: + raise ValueError(f"Metric '{metric_name}' requires per-class probabilities for multiclass tasks.") + return probabilities + + if probabilities.ndim != 2: + raise ValueError(f"Expected probability outputs to be 1D or 2D, got shape {probabilities.shape}") + + if probabilities.shape[1] == 1: + probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]]) + + if n_classes <= 2: + return probabilities[:, 1] + + if probabilities.shape[1] < n_classes and metric in PROBABILITY_METRICS: + raise ValueError( + f"Metric '{metric_name}' requires {n_classes} class probabilities, got {probabilities.shape[1]} columns." + ) + + return probabilities + + def evaluate_on_sample( spark: SparkSession, sample_uri: str, @@ -170,8 +215,18 @@ def _evaluate_predictor( X = df.drop(columns=columns_to_drop) y = df[target_columns[0]] - predictions = predictor.predict(X)["prediction"].values - return compute_metric(y, predictions, metric, group_ids=group_ids) + if metric_requires_probabilities(metric): + if not hasattr(predictor, "predict_proba") or not callable(predictor.predict_proba): + raise ValueError( + f"Metric '{metric}' requires probability scores, but predictor {type(predictor).__name__} " + "does not implement predict_proba()." + ) + raw_probabilities = predictor.predict_proba(X) + predictions = normalize_probability_predictions(y.values, raw_probabilities, metric) + else: + predictions = predictor.predict(X)["prediction"].values + + return compute_metric(y.values, predictions, metric, group_ids=group_ids) def compute_metric_hardcoded(y_true, y_pred, metric_name: str) -> float: diff --git a/plexe/search/evolutionary_search_policy.py b/plexe/search/evolutionary_search_policy.py index 9a09d173..a1a331a2 100644 --- a/plexe/search/evolutionary_search_policy.py +++ b/plexe/search/evolutionary_search_policy.py @@ -100,9 +100,9 @@ def _calculate_recent_progress(self, journal: SearchJournal, window: int = 5) -> return -0.3 # Slight negative - recent period mostly buggy but we had good solutions before # Calculate trend slope using linear regression - performances = [n.performance for n in good_recent] - x = np.arange(len(performances)) - slope = np.polyfit(x, performances, 1)[0] if len(performances) > 1 else 0.0 + scores = [journal.selection_score(n.performance) for n in good_recent] + x = np.arange(len(scores)) + slope = np.polyfit(x, scores, 1)[0] if len(scores) > 1 else 0.0 # Normalize slope to [-1, 1] range return float(np.clip(slope * 10, -1.0, 1.0)) # Scale for typical performance ranges @@ -112,7 +112,7 @@ def _calculate_stagnation(self, journal: SearchJournal, window: int = 3) -> floa if not journal.good_nodes: return 0.0 # No stagnation if no good solutions yet - best_performance = journal.best_performance + best_score = journal.selection_score(journal.best_performance) recent_nodes = journal.nodes[-window:] if len(journal.nodes) >= window else journal.nodes # Safety check: ensure we have nodes to analyze @@ -125,7 +125,7 @@ def _calculate_stagnation(self, journal: SearchJournal, window: int = 3) -> floa for node in reversed(recent_nodes): if not node.is_buggy and node.performance is not None: - if node.performance >= best_performance - threshold: + if journal.selection_score(node.performance) >= best_score - threshold: improvements += 1 # High stagnation = few improvements in recent window @@ -203,7 +203,7 @@ def _exploit_action(self, journal: SearchJournal, iteration: int, max_iterations temp = max(0.2, (1 - progress) ** 1.5) # Focus on top-k performers - sorted_nodes = sorted(good_nodes, key=lambda n: n.performance, reverse=True) + sorted_nodes = sorted(good_nodes, key=journal.sort_key, reverse=True) top_k = sorted_nodes[:k] if len(top_k) == 1 or temp < 0.25: @@ -211,9 +211,9 @@ def _exploit_action(self, journal: SearchJournal, iteration: int, max_iterations logger.info(f"Action: EXPLOIT (greedy) - solution {selected.solution_id} (perf={selected.performance:.4f})") else: # Softmax selection among top-k - perfs = np.array([n.performance for n in top_k]) + scores = np.array([journal.selection_score(n.performance) for n in top_k]) # Numerical stability: subtract max before exp - exp_probs = np.exp((perfs / temp) - np.max(perfs / temp)) + exp_probs = np.exp((scores / temp) - np.max(scores / temp)) probs = exp_probs / np.sum(exp_probs) selected = self._np_rng.choice(top_k, p=probs) logger.info( @@ -253,7 +253,7 @@ def _mutate_action(self, journal: SearchJournal) -> Solution | None: return self._explore_action(journal) # Prefer solutions with medium performance for mutation (not best, not worst) - sorted_nodes = sorted(good_nodes, key=lambda n: n.performance, reverse=True) + sorted_nodes = sorted(good_nodes, key=journal.sort_key, reverse=True) mid_range_start = max(0, len(sorted_nodes) // 4) mid_range_end = min(len(sorted_nodes), 3 * len(sorted_nodes) // 4) mid_range = sorted_nodes[mid_range_start:mid_range_end] if mid_range_end > mid_range_start else sorted_nodes @@ -273,13 +273,18 @@ def should_stop(self, journal: SearchJournal, iteration: int, max_iterations: in # Early stopping logic (only after halfway point to allow for exploration) if iteration > max_iterations * 0.4: stagnation = self._calculate_stagnation(journal, window=5) + baseline = journal.baseline_performance + best = journal.best_performance + + if journal.optimization_direction == "higher": + has_good_performance = best > baseline * 1.1 + has_exceptional_performance = best > baseline * 1.5 + else: + has_good_performance = best < baseline * 0.9 + has_exceptional_performance = best < baseline * 0.5 # Stop if highly stagnant AND we have a good solution (>10% improvement over baseline) - if ( - stagnation > 0.8 - and journal.best_performance > journal.baseline_performance * 1.1 - and len(journal.good_nodes) >= 2 - ): + if stagnation > 0.8 and has_good_performance and len(journal.good_nodes) >= 2: logger.info( f"Early stopping: High stagnation ({stagnation:.3f}) with good performance " @@ -288,7 +293,7 @@ def should_stop(self, journal: SearchJournal, iteration: int, max_iterations: in return True # Stop if we have exceptional performance (>50% improvement) and some stagnation - if stagnation > 0.6 and journal.best_performance > journal.baseline_performance * 1.5: + if stagnation > 0.6 and has_exceptional_performance: logger.info( f"Early stopping: Exceptional performance ({journal.best_performance:.4f}) " diff --git a/plexe/search/journal.py b/plexe/search/journal.py index 1cbf2534..1bd92450 100644 --- a/plexe/search/journal.py +++ b/plexe/search/journal.py @@ -32,20 +32,47 @@ class SearchJournal: - Improve nodes (enhancements) """ - def __init__(self, baseline: Baseline | None = None): + def __init__(self, baseline: Baseline | None = None, optimization_direction: str = "higher"): """ Initialize journal. Args: baseline: Baseline model for comparison + optimization_direction: Metric optimization direction ("higher" or "lower") """ + if optimization_direction not in {"higher", "lower"}: + raise ValueError(f"optimization_direction must be 'higher' or 'lower', got: {optimization_direction}") + self.baseline = baseline self.baseline_performance = baseline.performance if baseline else 0.0 + self.optimization_direction = optimization_direction self.nodes: list[Solution] = [] self.successful_attempts = 0 self.failed_attempts = 0 + def selection_score(self, value: float) -> float: + """Normalize a metric value so larger always means better.""" + return value if self.optimization_direction == "higher" else -value + + def is_better(self, candidate: float, reference: float | None) -> bool: + """ + Compare two metric values using the configured optimization direction. + + Args: + candidate: Candidate performance value + reference: Reference performance value (or None) + """ + if reference is None: + return True + return self.selection_score(candidate) > self.selection_score(reference) + + def sort_key(self, node: Solution) -> float: + """Direction-aware sort key for solution nodes.""" + if node.performance is None: + return float("-inf") + return self.selection_score(node.performance) + # ============================================ # Adding Nodes # ============================================ @@ -97,7 +124,7 @@ def best_node(self) -> Solution | None: good = self.good_nodes if not good: return None - return max(good, key=lambda n: n.performance) + return max(good, key=self.sort_key) @property def best_performance(self) -> float: @@ -186,11 +213,9 @@ def summarize(self) -> str: best = self.best_node if best: - improvement = ( - (best.performance - self.baseline_performance) / self.baseline_performance * 100 - if self.baseline_performance > 0 - else 0 - ) + score_delta = self.selection_score(best.performance) - self.selection_score(self.baseline_performance) + baseline_scale = abs(self.baseline_performance) + improvement = (score_delta / baseline_scale * 100) if baseline_scale > 0 else 0 summary += ( f" Best: {best.performance:.4f} ({improvement:+.1f}% vs baseline) [solution {best.solution_id}]\n" ) @@ -216,10 +241,10 @@ def get_improvement_trend(self, window: int = 5) -> float: # Look at last N successful attempts recent = successful[-window:] if len(successful) > window else successful - performances = [n.performance for n in recent] + scores = [self.selection_score(n.performance) for n in recent] # Calculate average delta - deltas = [performances[i + 1] - performances[i] for i in range(len(performances) - 1)] + deltas = [scores[i + 1] - scores[i] for i in range(len(scores) - 1)] return sum(deltas) / len(deltas) if deltas else 0.0 @@ -258,7 +283,7 @@ def get_successful_improvements(self, limit: int = 5) -> list[Solution]: child_nodes = [n for n in self.nodes if n.parent is not None and not n.is_buggy and n.performance is not None] # Sort by performance - child_nodes.sort(key=lambda n: n.performance, reverse=True) + child_nodes.sort(key=self.sort_key, reverse=True) return child_nodes[:limit] @@ -271,6 +296,7 @@ def to_dict(self) -> dict: return { "baseline": self.baseline.to_dict() if self.baseline else None, "baseline_performance": self.baseline_performance, + "optimization_direction": self.optimization_direction, "nodes": [node.to_dict() for node in self.nodes], "successful_attempts": self.successful_attempts, "failed_attempts": self.failed_attempts, @@ -289,7 +315,10 @@ def from_dict(d: dict) -> "SearchJournal": baseline = Baseline.from_dict(d["baseline"]) if d.get("baseline") else None # Create journal - journal = SearchJournal(baseline=baseline) + journal = SearchJournal( + baseline=baseline, + optimization_direction=d.get("optimization_direction", "higher"), + ) journal.baseline_performance = d.get("baseline_performance", 0.0) journal.successful_attempts = d.get("successful_attempts", 0) journal.failed_attempts = d.get("failed_attempts", 0) diff --git a/plexe/search/tree_policy.py b/plexe/search/tree_policy.py index 988c674d..49860ead 100644 --- a/plexe/search/tree_policy.py +++ b/plexe/search/tree_policy.py @@ -64,7 +64,7 @@ def decide_next_solution( k = max(1, round(self.num_drafts * (1 - progress))) temp = max(0.3, (1 - progress) ** 2) - sorted_nodes = sorted(journal.good_nodes, key=lambda n: n.performance, reverse=True) + sorted_nodes = sorted(journal.good_nodes, key=journal.sort_key, reverse=True) top_k = sorted_nodes[:k] # Greedy if k=1 or low temperature @@ -73,8 +73,8 @@ def decide_next_solution( return top_k[0] # Softmax sampling - perfs = np.array([n.performance for n in top_k]) - probs = np.exp((perfs / temp) - np.max(perfs / temp)) + scores = np.array([journal.selection_score(n.performance) for n in top_k]) + probs = np.exp((scores / temp) - np.max(scores / temp)) probs /= probs.sum() selected = self._np_rng.choice(top_k, p=probs) diff --git a/plexe/templates/inference/catboost_predictor.py b/plexe/templates/inference/catboost_predictor.py index 51797d5d..439b6edc 100644 --- a/plexe/templates/inference/catboost_predictor.py +++ b/plexe/templates/inference/catboost_predictor.py @@ -6,8 +6,10 @@ """ from pathlib import Path +import json import joblib +import numpy as np import pandas as pd from catboost import CatBoostClassifier, CatBoostRegressor @@ -29,6 +31,14 @@ def __init__(self, model_dir: str): model_dir = Path(model_dir) artifacts_dir = model_dir / "artifacts" + metadata_path = artifacts_dir / "metadata.json" + if metadata_path.exists(): + with open(metadata_path) as f: + metadata = json.load(f) + self._task_type = metadata.get("task_type", "") + else: + self._task_type = "" + # Execute pipeline code (defines custom FunctionTransformer functions) code_path = model_dir / "src" / "pipeline.py" if code_path.exists(): @@ -71,6 +81,29 @@ def predict(self, x: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame({"prediction": predictions}) + def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: + """ + Predict per-class probabilities on input DataFrame. + + Returns: + DataFrame with probability columns named proba_0..proba_n. + """ + if self._task_type not in {"binary_classification", "multiclass_classification"}: + raise ValueError( + f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" + ) + if not isinstance(self.model, CatBoostClassifier): + raise ValueError(f"Model type {type(self.model).__name__} does not support predict_proba().") + + probabilities = np.asarray(self.model.predict_proba(self.pipeline.transform(x))) + if probabilities.ndim == 1: + probabilities = probabilities.reshape(-1, 1) + if probabilities.shape[1] == 1: + probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]]) + + columns = [f"proba_{i}" for i in range(probabilities.shape[1])] + return pd.DataFrame(probabilities, columns=columns) + # ============================================ # Example Usage diff --git a/plexe/templates/inference/keras_predictor.py b/plexe/templates/inference/keras_predictor.py index e3e4d904..6af1587d 100644 --- a/plexe/templates/inference/keras_predictor.py +++ b/plexe/templates/inference/keras_predictor.py @@ -42,8 +42,12 @@ def __init__(self, model_dir: str): with open(metadata_path) as f: metadata = json.load(f) raw_task_type = metadata.get("task_type", "") + self._loss_class = metadata.get("loss_class", "") + self._loss_config = metadata.get("loss_config", {}) or {} else: raw_task_type = "" + self._loss_class = "" + self._loss_config = {} self._task_type = raw_task_type @@ -60,6 +64,54 @@ def __init__(self, model_dir: str): with open(artifacts_dir / "pipeline.pkl", "rb") as f: self.pipeline = cloudpickle.load(f) + def _uses_logits_output(self) -> bool: + """Return True when model outputs are logits based on training loss metadata.""" + from_logits = bool(self._loss_config.get("from_logits", False)) + if self._task_type == "binary_classification" and self._loss_class == "BinaryCrossentropy": + return from_logits + if self._task_type == "multiclass_classification" and self._loss_class in { + "SparseCategoricalCrossentropy", + "CategoricalCrossentropy", + }: + return from_logits + return False + + def _probabilities_from_raw(self, raw_predictions): + """Convert raw model outputs into probability arrays.""" + import numpy as np + + probabilities = np.asarray(raw_predictions) + if probabilities.ndim == 1: + probabilities = probabilities.reshape(-1, 1) + + if self._task_type == "binary_classification": + if probabilities.shape[1] == 1: + positive = probabilities[:, 0] + if self._uses_logits_output(): + positive = 1.0 / (1.0 + np.exp(-positive)) + probabilities = np.column_stack([1 - positive, positive]) + return probabilities + + if probabilities.shape[1] == 2: + if self._uses_logits_output(): + shifted = probabilities - np.max(probabilities, axis=1, keepdims=True) + exp_values = np.exp(shifted) + probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True) + return probabilities + + raise ValueError(f"Binary classification expects 1 or 2 outputs, got shape {probabilities.shape}") + + if self._task_type == "multiclass_classification": + if self._uses_logits_output(): + shifted = probabilities - np.max(probabilities, axis=1, keepdims=True) + exp_values = np.exp(shifted) + probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True) + return probabilities + + raise ValueError( + f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" + ) + def predict(self, x: pd.DataFrame) -> pd.DataFrame: """ Make predictions on input DataFrame. @@ -80,11 +132,11 @@ def predict(self, x: pd.DataFrame) -> pd.DataFrame: # Post-process based on task_type from metadata if self._task_type == "binary_classification": - # Keras outputs probabilities — threshold at 0.5 - predictions = raw_predictions.squeeze() - predictions = (predictions > 0.5).astype(int) + probabilities = self._probabilities_from_raw(raw_predictions) + predictions = (probabilities[:, 1] > 0.5).astype(int) elif self._task_type == "multiclass_classification": - predictions = np.argmax(raw_predictions, axis=1) + probabilities = self._probabilities_from_raw(raw_predictions) + predictions = np.argmax(probabilities, axis=1) else: # Regression or unknown: return raw values predictions = raw_predictions.squeeze() @@ -95,19 +147,12 @@ def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: """ Predict per-class probabilities on input DataFrame. - Returns raw model outputs (sigmoid/softmax values) without argmax. + Returns per-class probabilities with columns proba_0..proba_n. """ - import numpy as np - x_transformed = self.pipeline.transform(x) raw_predictions = self.model.predict(x_transformed, verbose=0) - probabilities = np.asarray(raw_predictions) - if probabilities.ndim == 1: - probabilities = probabilities.reshape(-1, 1) - - if probabilities.shape[1] == 1: - probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]]) + probabilities = self._probabilities_from_raw(raw_predictions) columns = [f"proba_{i}" for i in range(probabilities.shape[1])] return pd.DataFrame(probabilities, columns=columns) diff --git a/plexe/templates/inference/lightgbm_predictor.py b/plexe/templates/inference/lightgbm_predictor.py index fb503c82..7eaa3ab4 100644 --- a/plexe/templates/inference/lightgbm_predictor.py +++ b/plexe/templates/inference/lightgbm_predictor.py @@ -6,8 +6,10 @@ """ from pathlib import Path +import json import joblib +import numpy as np import pandas as pd @@ -28,6 +30,14 @@ def __init__(self, model_dir: str): model_dir = Path(model_dir) artifacts_dir = model_dir / "artifacts" + metadata_path = artifacts_dir / "metadata.json" + if metadata_path.exists(): + with open(metadata_path) as f: + metadata = json.load(f) + self._task_type = metadata.get("task_type", "") + else: + self._task_type = "" + # Execute pipeline code (defines custom FunctionTransformer functions) code_path = model_dir / "src" / "pipeline.py" if code_path.exists(): @@ -61,6 +71,29 @@ def predict(self, x: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame({"prediction": predictions}) + def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: + """ + Predict per-class probabilities on input DataFrame. + + Returns: + DataFrame with probability columns named proba_0..proba_n. + """ + if self._task_type not in {"binary_classification", "multiclass_classification"}: + raise ValueError( + f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" + ) + if not hasattr(self.model, "predict_proba"): + raise ValueError(f"Model type {type(self.model).__name__} does not support predict_proba().") + + probabilities = np.asarray(self.model.predict_proba(self.pipeline.transform(x))) + if probabilities.ndim == 1: + probabilities = probabilities.reshape(-1, 1) + if probabilities.shape[1] == 1: + probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]]) + + columns = [f"proba_{i}" for i in range(probabilities.shape[1])] + return pd.DataFrame(probabilities, columns=columns) + # ============================================ # Example Usage diff --git a/plexe/templates/inference/pytorch_predictor.py b/plexe/templates/inference/pytorch_predictor.py index 0faad4cd..b43fd6d4 100644 --- a/plexe/templates/inference/pytorch_predictor.py +++ b/plexe/templates/inference/pytorch_predictor.py @@ -109,6 +109,11 @@ def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: Applies sigmoid for single-logit binary models, otherwise softmax. """ + if self._task_type not in {"binary_classification", "multiclass_classification"}: + raise ValueError( + f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" + ) + # Transform features through pipeline x_transformed = self.pipeline.transform(x) diff --git a/plexe/templates/inference/xgboost_predictor.py b/plexe/templates/inference/xgboost_predictor.py index 02dd7880..9968a688 100644 --- a/plexe/templates/inference/xgboost_predictor.py +++ b/plexe/templates/inference/xgboost_predictor.py @@ -6,8 +6,10 @@ """ from pathlib import Path +import json import joblib +import numpy as np import pandas as pd @@ -28,6 +30,14 @@ def __init__(self, model_dir: str): model_dir = Path(model_dir) artifacts_dir = model_dir / "artifacts" + metadata_path = artifacts_dir / "metadata.json" + if metadata_path.exists(): + with open(metadata_path) as f: + metadata = json.load(f) + self._task_type = metadata.get("task_type", "") + else: + self._task_type = "" + # Execute pipeline code (defines custom FunctionTransformer functions) code_path = model_dir / "src" / "pipeline.py" if code_path.exists(): @@ -61,6 +71,29 @@ def predict(self, x: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame({"prediction": predictions}) + def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: + """ + Predict per-class probabilities on input DataFrame. + + Returns: + DataFrame with probability columns named proba_0..proba_n. + """ + if self._task_type not in {"binary_classification", "multiclass_classification"}: + raise ValueError( + f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" + ) + if not hasattr(self.model, "predict_proba"): + raise ValueError(f"Model type {type(self.model).__name__} does not support predict_proba().") + + probabilities = np.asarray(self.model.predict_proba(self.pipeline.transform(x))) + if probabilities.ndim == 1: + probabilities = probabilities.reshape(-1, 1) + if probabilities.shape[1] == 1: + probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]]) + + columns = [f"proba_{i}" for i in range(probabilities.shape[1])] + return pd.DataFrame(probabilities, columns=columns) + # ============================================ # Example Usage diff --git a/plexe/tools/submission.py b/plexe/tools/submission.py index 36c41500..8589f9ca 100644 --- a/plexe/tools/submission.py +++ b/plexe/tools/submission.py @@ -960,7 +960,11 @@ def validate_baseline_predictor(predictor: Any, name: str, description: str) -> ValueError: If validation or metric computation fails """ import numpy as np - from plexe.helpers import compute_metric + from plexe.helpers import ( + compute_metric, + metric_requires_probabilities, + normalize_probability_predictions, + ) # Check class name matches template if type(predictor).__name__ != "HeuristicBaselinePredictor": @@ -974,18 +978,37 @@ def validate_baseline_predictor(predictor: Any, name: str, description: str) -> logger.error(error_msg) raise ValueError(error_msg) + requires_proba = metric_requires_probabilities(context.metric.name) + if requires_proba and (not hasattr(predictor, "predict_proba") or not callable(predictor.predict_proba)): + error_msg = ( + f"Primary metric '{context.metric.name}' requires probability scores. " + "Baseline predictor must implement callable .predict_proba()" + ) + logger.error(error_msg) + raise ValueError(error_msg) + # Test on small validation sample try: X_test = val_sample_df.drop(columns=context.output_targets, errors="ignore").head(10) - predictions = predictor.predict(X_test) - - if not isinstance(predictions, list | tuple | np.ndarray | pd.Series): - error_msg = f"predict() must return array-like, got {type(predictions)}" + predictions = predictor.predict_proba(X_test) if requires_proba else predictor.predict(X_test) + + expected_types = ( + (list, tuple, np.ndarray, pd.Series, pd.DataFrame) + if requires_proba + else ( + list, + tuple, + np.ndarray, + pd.Series, + ) + ) + if not isinstance(predictions, expected_types): + error_msg = f"{'predict_proba' if requires_proba else 'predict'}() must return array-like, got {type(predictions)}" logger.error(error_msg) raise ValueError(error_msg) if len(predictions) != len(X_test): - error_msg = f"predict() returned {len(predictions)} predictions for {len(X_test)} samples" + error_msg = f"{'predict_proba' if requires_proba else 'predict'}() returned {len(predictions)} predictions for {len(X_test)} samples" logger.error(error_msg) raise ValueError(error_msg) @@ -998,10 +1021,14 @@ def validate_baseline_predictor(predictor: Any, name: str, description: str) -> try: X_val = val_sample_df.drop(columns=context.output_targets, errors="ignore") y_val = val_sample_df[context.output_targets[0]] - y_pred = predictor.predict(X_val) + if requires_proba: + raw_proba = predictor.predict_proba(X_val) + y_pred_input = normalize_probability_predictions(y_val.values, raw_proba, context.metric.name) + else: + y_pred_input = predictor.predict(X_val) # This is where squared= errors would happen - agent can now see them! - performance = compute_metric(y_true=y_val.values, y_pred=y_pred, metric_name=context.metric.name) + performance = compute_metric(y_true=y_val.values, y_pred=y_pred_input, metric_name=context.metric.name) logger.info(f"Baseline performance: {context.metric.name}={performance:.4f}") @@ -1120,7 +1147,11 @@ def evaluate_baseline_performance() -> str: Returns: String with performance metric value """ - from plexe.helpers import compute_metric + from plexe.helpers import ( + compute_metric, + metric_requires_probabilities, + normalize_probability_predictions, + ) # Check prerequisites if context.baseline_predictor is None: @@ -1147,12 +1178,23 @@ def evaluate_baseline_performance() -> str: else None ) - # Make predictions (standard array interface) - y_pred = context.baseline_predictor.predict(X_val) + requires_proba = metric_requires_probabilities(context.metric.name) + if requires_proba: + if not hasattr(context.baseline_predictor, "predict_proba") or not callable( + context.baseline_predictor.predict_proba + ): + raise ValueError( + f"Metric '{context.metric.name}' requires probability scores but baseline predictor " + "does not implement predict_proba()." + ) + raw_proba = context.baseline_predictor.predict_proba(X_val) + y_pred_input = normalize_probability_predictions(y_val.values, raw_proba, context.metric.name) + else: + y_pred_input = context.baseline_predictor.predict(X_val) # Compute metric (pass group_ids for ranking metrics) performance = compute_metric( - y_true=y_val.values, y_pred=y_pred, metric_name=context.metric.name, group_ids=group_ids + y_true=y_val.values, y_pred=y_pred_input, metric_name=context.metric.name, group_ids=group_ids ) # Save performance diff --git a/plexe/workflow.py b/plexe/workflow.py index 6076f114..3b0b519a 100644 --- a/plexe/workflow.py +++ b/plexe/workflow.py @@ -178,6 +178,8 @@ def build_model( context = BuildContext.from_dict(checkpoint_data["context"]) if checkpoint_data.get("search_journal"): journal = SearchJournal.from_dict(checkpoint_data["search_journal"]) + if context.metric: + journal.optimization_direction = context.metric.optimization_direction context.scratch["_search_journal"] = journal logger.info(f"Restored SearchJournal with {len(journal.nodes)} solutions") if checkpoint_data.get("insight_store"): @@ -437,7 +439,7 @@ def build_model( if valid_alternatives: # Sort by performance and pick the best alternative - valid_alternatives.sort(key=lambda s: s.performance, reverse=True) + valid_alternatives.sort(key=journal.sort_key, reverse=True) fallback_solution = valid_alternatives[0] logger.info(f"Found {len(valid_alternatives)} valid alternatives") @@ -1342,9 +1344,13 @@ def search_models( # Use restored journal/insight_store if resuming, otherwise create fresh if restored_journal: journal = restored_journal + journal.optimization_direction = context.metric.optimization_direction logger.info(f"Using restored SearchJournal with {len(journal.nodes)} existing solutions") else: - journal = SearchJournal(baseline=context.heuristic_baseline) + journal = SearchJournal( + baseline=context.heuristic_baseline, + optimization_direction=context.metric.optimization_direction, + ) if restored_insight_store: insight_store = restored_insight_store diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index 50861deb..807b17b7 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 00:06:47 +> Generated on 2026-03-03 02:35:53 Test suite structure and test case documentation. @@ -75,6 +75,14 @@ Tests for user feedback integration in agents. - `test_hypothesiser_includes_feedback(self, mock_context, mock_config)` - HypothesiserAgent should include feedback in instructions. - `test_agent_without_feedback_works(self, mock_context, mock_config)` - Agents should work normally when no feedback is provided. +--- +## `unit/agents/test_model_evaluator_prompt.py` +Prompt-level tests for ModelEvaluatorAgent probability guidance. + +**Functions:** +- `test_phase_1_prompt_includes_probability_metric_guidance()` - No description +- `test_build_agent_instructions_document_predict_proba_interface()` - No description + --- ## `unit/execution/training/test_local_runner.py` Tests for LocalProcessRunner GPU detection and command construction. @@ -99,6 +107,7 @@ Determinism tests for EvolutionarySearchPolicy local RNG behavior. **Functions:** - `test_evolutionary_policy_determinism(monkeypatch, tmp_path)` - No description +- `test_evolutionary_exploit_respects_lower_metric_direction(tmp_path)` - No description --- ## `unit/search/test_insight_store.py` @@ -117,6 +126,7 @@ Unit tests for SearchJournal. - `test_journal_add_successful_node()` - Test recording a successful solution. - `test_journal_add_buggy_node()` - Test recording a failed attempt. - `test_journal_best_node_tracks_best()` - Test best_node returns the highest performing solution. +- `test_journal_best_node_respects_lower_direction()` - best_node should select smallest metric when optimization is lower. - `test_journal_failure_rate()` - Test failure rate computation. - `test_journal_failure_rate_empty()` - Test failure rate on empty journal. - `test_journal_get_history()` - Test history returns recent entries. @@ -124,6 +134,8 @@ Unit tests for SearchJournal. - `test_journal_improvement_trend_insufficient_data()` - Test improvement trend with fewer than 2 successful solutions. - `test_journal_get_history_includes_train_performance()` - get_history should include train_performance when set on a solution. - `test_journal_get_history_train_performance_none()` - get_history should include train_performance=None when not set. +- `test_journal_serialization_preserves_optimization_direction()` - to_dict/from_dict should preserve optimization_direction. +- `test_journal_from_dict_defaults_optimization_direction_to_higher()` - Older checkpoints without optimization_direction should default to higher. --- ## `unit/search/test_tree_policy_determinism.py` @@ -131,6 +143,7 @@ Determinism tests for TreeSearchPolicy local RNG behavior. **Functions:** - `test_tree_policy_determinism(monkeypatch, tmp_path)` - No description +- `test_tree_policy_respects_lower_metric_direction(tmp_path)` - No description --- ## `unit/templates/features/test_pipeline_runner.py` @@ -159,6 +172,25 @@ Unit tests for PyTorch DataLoader worker fallback behavior. - `test_resolve_num_workers_uses_context_when_start_method_is_none(monkeypatch) -> None` - When get_start_method returns None, context start method should be used. - `test_resolve_num_workers_kept_on_non_darwin_spawn(monkeypatch) -> None` - Spawn on non-macOS should keep the requested worker count. +--- +## `unit/test_baseline_probability_validation.py` +Unit tests for baseline probability validation behavior. + +**Functions:** +- `test_validate_baseline_predictor_requires_predict_proba_for_probability_metrics(tmp_path)` - No description +- `test_validate_baseline_predictor_accepts_predict_proba_for_probability_metrics(tmp_path)` - No description + +--- +## `unit/test_catboost_predictor.py` +Unit tests for CatBoost predictor template. + +**`DummyPipeline`** - Minimal pipeline stub for tests. +- `transform(self, x)` - No description + +**Functions:** +- `test_catboost_predictor_predict_proba_classification() -> None` - No description +- `test_catboost_predictor_predict_proba_raises_for_regression() -> None` - No description + --- ## `unit/test_config.py` Unit tests for config helpers. @@ -182,6 +214,13 @@ Unit tests for workflow helper functions. - `test_select_viable_model_types_defaults_image()` - Default model types intersect with IMAGE_PATH. - `test_select_viable_model_types_no_intersection()` - No compatible frameworks should raise ValueError. - `test_compute_metric_map_grouped()` - MAP should compute per-group and average. +- `test_metric_requires_probabilities()` - No description +- `test_normalize_probability_predictions_binary_matrix_to_positive_scores()` - No description +- `test_normalize_probability_predictions_multiclass_keeps_matrix()` - No description +- `test_normalize_probability_predictions_multiclass_raises_on_1d()` - No description +- `test_evaluate_predictor_uses_predict_for_label_metrics()` - No description +- `test_evaluate_predictor_uses_predict_proba_for_probability_metrics()` - No description +- `test_evaluate_predictor_raises_when_probability_metric_missing_predict_proba()` - No description --- ## `unit/test_imports.py` @@ -190,6 +229,23 @@ Test that all production modules can be imported without errors. **Functions:** - `test_all_modules_importable()` - Import all production modules in the plexe/ package to catch import errors. +--- +## `unit/test_keras_predictor.py` +Unit tests for Keras predictor template semantics. + +**`DummyPipeline`** - Minimal pipeline stub for tests. +- `transform(self, x)` - No description + +**`DummyModel`** - Minimal model stub for tests. +- `__init__(self, output)` +- `predict(self, x, verbose)` - No description + +**Functions:** +- `test_keras_probabilities_from_binary_logits() -> None` - No description +- `test_keras_probabilities_from_binary_two_logit_output() -> None` - No description +- `test_keras_probabilities_from_multiclass_logits() -> None` - No description +- `test_keras_predict_proba_raises_for_regression() -> None` - No description + --- ## `unit/test_lightgbm_predictor.py` Unit tests for LightGBM predictor template. @@ -197,12 +253,17 @@ Unit tests for LightGBM predictor template. **`DummyModel`** - Minimal model stub with a predict method. - `predict(self, x)` - No description +**`DummyClassificationModel`** - Minimal model stub with predict_proba for classification. +- `predict_proba(self, x)` - No description + **`DummyPipeline`** - Minimal pipeline stub with a transform method. - `transform(self, x)` - No description **Functions:** - `test_lightgbm_predictor_basic(tmp_path: Path) -> None` - No description - `test_lightgbm_predictor_label_encoder(tmp_path: Path) -> None` - No description +- `test_lightgbm_predictor_predict_proba_classification(tmp_path: Path) -> None` - No description +- `test_lightgbm_predictor_predict_proba_raises_for_regression(tmp_path: Path) -> None` - No description --- ## `unit/test_models.py` @@ -215,6 +276,20 @@ Unit tests for core model dataclasses. - `test_solution_from_dict_backward_compatible()` - Old checkpoints missing train_performance should deserialize cleanly. - `test_solution_from_dict_with_train_performance()` - Checkpoints with train_performance should round-trip correctly. +--- +## `unit/test_pytorch_predictor.py` +Unit tests for PyTorch predictor template semantics. + +**`DummyPipeline`** - Minimal pipeline stub for tests. +- `transform(self, x)` - No description + +**`DummyModel`** - Minimal callable model stub for tests. +- `__init__(self, outputs)` + +**Functions:** +- `test_pytorch_predict_proba_binary_classification() -> None` - No description +- `test_pytorch_predict_proba_raises_for_regression() -> None` - No description + --- ## `unit/test_submission_pytorch.py` Unit tests for PyTorch model submission. @@ -222,6 +297,21 @@ Unit tests for PyTorch model submission. **Functions:** - `test_save_model_pytorch(tmp_path)` - Test PyTorch model submission validation and context scratch storage. +--- +## `unit/test_xgboost_predictor.py` +Unit tests for XGBoost predictor template. + +**`DummyModel`** - Minimal predictor stub for tests. +- `predict(self, x)` - No description +- `predict_proba(self, x)` - No description + +**`DummyPipeline`** - Minimal pipeline stub for tests. +- `transform(self, x)` - No description + +**Functions:** +- `test_xgboost_predictor_predict_proba_classification(tmp_path: Path) -> None` - No description +- `test_xgboost_predictor_predict_proba_raises_for_regression(tmp_path: Path) -> None` - No description + --- ## `unit/utils/test_parquet_dataset.py` Tests for streaming parquet data loading utilities. diff --git a/tests/unit/agents/test_model_evaluator_prompt.py b/tests/unit/agents/test_model_evaluator_prompt.py new file mode 100644 index 00000000..ad19bc53 --- /dev/null +++ b/tests/unit/agents/test_model_evaluator_prompt.py @@ -0,0 +1,23 @@ +"""Prompt-level tests for ModelEvaluatorAgent probability guidance.""" + +from __future__ import annotations + +import inspect + +from plexe.agents.model_evaluator import ModelEvaluatorAgent + + +def test_phase_1_prompt_includes_probability_metric_guidance(): + prompt = ModelEvaluatorAgent._get_phase_1_prompt("Predict churn", "roc_auc") + + assert "predict_proba" in prompt + assert "roc_auc" in prompt + assert "Binary: use positive-class scores" in prompt + assert "Multiclass: use full per-class probability matrix" in prompt + + +def test_build_agent_instructions_document_predict_proba_interface(): + source = inspect.getsource(ModelEvaluatorAgent._build_agent) + + assert "predict_proba" in source + assert "probability-based" in source diff --git a/tests/unit/search/test_evolutionary_policy_determinism.py b/tests/unit/search/test_evolutionary_policy_determinism.py index 93bfc448..e76b4738 100644 --- a/tests/unit/search/test_evolutionary_policy_determinism.py +++ b/tests/unit/search/test_evolutionary_policy_determinism.py @@ -56,3 +56,15 @@ def _fail(*args, **kwargs): assert (selected_a is None) == (selected_b is None) if selected_a is not None: assert selected_a.solution_id == selected_b.solution_id + + +def test_evolutionary_exploit_respects_lower_metric_direction(tmp_path): + journal = SearchJournal(optimization_direction="lower") + for idx, perf in enumerate([0.9, 0.3, 0.2], start=1): + journal.add_node(_make_solution(idx, performance=perf)) + + policy = EvolutionarySearchPolicy(num_drafts=2, seed=456) + selected = policy._exploit_action(journal, iteration=9, max_iterations=10) + + assert selected is not None + assert selected.solution_id == 3 diff --git a/tests/unit/search/test_journal.py b/tests/unit/search/test_journal.py index b76264c9..9f8a31e8 100644 --- a/tests/unit/search/test_journal.py +++ b/tests/unit/search/test_journal.py @@ -102,6 +102,22 @@ def test_journal_best_node_tracks_best(): assert journal.best_performance == 0.90 +def test_journal_best_node_respects_lower_direction(): + """best_node should select smallest metric when optimization is lower.""" + journal = SearchJournal(optimization_direction="lower") + + sol1 = _make_solution(0, performance=0.40) + sol2 = _make_solution(1, performance=0.25) + sol3 = _make_solution(2, performance=0.31) + + journal.add_node(sol1) + journal.add_node(sol2) + journal.add_node(sol3) + + assert journal.best_node == sol2 + assert journal.best_performance == 0.25 + + # ============================================ # Failure Rate Tests # ============================================ @@ -195,3 +211,24 @@ def test_journal_get_history_train_performance_none(): history = journal.get_history() assert history[0]["train_performance"] is None + + +def test_journal_serialization_preserves_optimization_direction(): + """to_dict/from_dict should preserve optimization_direction.""" + journal = SearchJournal(optimization_direction="lower") + journal.add_node(_make_solution(0, performance=0.3)) + + restored = SearchJournal.from_dict(journal.to_dict()) + assert restored.optimization_direction == "lower" + assert restored.best_performance == pytest.approx(0.3) + + +def test_journal_from_dict_defaults_optimization_direction_to_higher(): + """Older checkpoints without optimization_direction should default to higher.""" + journal = SearchJournal() + journal.add_node(_make_solution(0, performance=0.3)) + payload = journal.to_dict() + payload.pop("optimization_direction") + + restored = SearchJournal.from_dict(payload) + assert restored.optimization_direction == "higher" diff --git a/tests/unit/search/test_tree_policy_determinism.py b/tests/unit/search/test_tree_policy_determinism.py index 1e876f1c..98d73401 100644 --- a/tests/unit/search/test_tree_policy_determinism.py +++ b/tests/unit/search/test_tree_policy_determinism.py @@ -55,3 +55,17 @@ def _fail(*args, **kwargs): assert selected_a is not None assert selected_b is not None assert selected_a.solution_id == selected_b.solution_id + + +def test_tree_policy_respects_lower_metric_direction(tmp_path): + journal = SearchJournal(optimization_direction="lower") + for idx, perf in enumerate([0.9, 0.2, 0.5], start=1): + journal.add_node(_make_solution(idx, performance=perf)) + + context = _make_context(tmp_path) + policy = TreeSearchPolicy(num_drafts=2, debug_prob=0.0, seed=123) + + selected = policy.decide_next_solution(journal, context, iteration=9, max_iterations=10) + + assert selected is not None + assert selected.solution_id == 2 diff --git a/tests/unit/test_baseline_probability_validation.py b/tests/unit/test_baseline_probability_validation.py new file mode 100644 index 00000000..85c795db --- /dev/null +++ b/tests/unit/test_baseline_probability_validation.py @@ -0,0 +1,63 @@ +"""Unit tests for baseline probability validation behavior.""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from plexe.models import BuildContext, Metric +from plexe.tools.submission import get_validate_baseline_predictor_tool + + +def _make_context(tmp_path) -> BuildContext: + context = BuildContext( + user_id="user", + experiment_id="exp", + dataset_uri="/tmp/dataset.parquet", + work_dir=tmp_path, + intent="predict churn", + ) + context.output_targets = ["target"] + return context + + +def test_validate_baseline_predictor_requires_predict_proba_for_probability_metrics(tmp_path): + context = _make_context(tmp_path) + context.metric = Metric(name="roc_auc", optimization_direction="higher") + val_df = pd.DataFrame({"feature": [1, 2, 3, 4], "target": [0, 1, 1, 0]}) + + class HeuristicBaselinePredictor: + def predict(self, x): + return np.zeros(len(x), dtype=int) + + validate_tool = get_validate_baseline_predictor_tool(context, val_df) + predictor = HeuristicBaselinePredictor() + + with pytest.raises(ValueError, match="requires probability scores"): + validate_tool(predictor, "baseline", "missing predict_proba") + + +def test_validate_baseline_predictor_accepts_predict_proba_for_probability_metrics(tmp_path): + context = _make_context(tmp_path) + context.metric = Metric(name="roc_auc", optimization_direction="higher") + val_df = pd.DataFrame({"feature": [1, 2, 3, 4], "target": [0, 1, 1, 0]}) + + class HeuristicBaselinePredictor: + def predict(self, x): + return np.zeros(len(x), dtype=int) + + def predict_proba(self, x): + return pd.DataFrame( + { + "proba_0": np.array([0.9, 0.2, 0.1, 0.8]), + "proba_1": np.array([0.1, 0.8, 0.9, 0.2]), + } + ) + + validate_tool = get_validate_baseline_predictor_tool(context, val_df) + predictor = HeuristicBaselinePredictor() + message = validate_tool(predictor, "baseline", "has predict_proba") + + assert "validated" in message.lower() + assert context.baseline_performance is not None diff --git a/tests/unit/test_catboost_predictor.py b/tests/unit/test_catboost_predictor.py new file mode 100644 index 00000000..c7c32a31 --- /dev/null +++ b/tests/unit/test_catboost_predictor.py @@ -0,0 +1,45 @@ +"""Unit tests for CatBoost predictor template.""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +catboost = pytest.importorskip("catboost") + +from plexe.templates.inference.catboost_predictor import CatBoostPredictor + + +class DummyPipeline: + """Minimal pipeline stub for tests.""" + + def transform(self, x): + return x + + +def test_catboost_predictor_predict_proba_classification() -> None: + model = catboost.CatBoostClassifier(iterations=5, verbose=False) + X_train = pd.DataFrame({"f1": [0.0, 0.1, 0.9, 1.0], "f2": [0.0, 0.2, 0.8, 1.0]}) + y_train = np.array([0, 0, 1, 1]) + model.fit(X_train, y_train) + + predictor = CatBoostPredictor.__new__(CatBoostPredictor) + predictor._task_type = "binary_classification" + predictor.model = model + predictor.pipeline = DummyPipeline() + + probabilities = predictor.predict_proba(pd.DataFrame({"f1": [0.05, 0.95], "f2": [0.05, 0.95]})) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + + +def test_catboost_predictor_predict_proba_raises_for_regression() -> None: + predictor = CatBoostPredictor.__new__(CatBoostPredictor) + predictor._task_type = "regression" + predictor.model = object() + predictor.pipeline = DummyPipeline() + + with pytest.raises(ValueError, match="only valid for classification"): + predictor.predict_proba(pd.DataFrame({"f1": [0.1], "f2": [0.2]})) diff --git a/tests/unit/test_helpers.py b/tests/unit/test_helpers.py index 3902f00c..6f839e92 100644 --- a/tests/unit/test_helpers.py +++ b/tests/unit/test_helpers.py @@ -3,11 +3,18 @@ """ import numpy as np +import pandas as pd import pytest from sklearn.metrics import accuracy_score from plexe.config import DEFAULT_MODEL_TYPES, ModelType, detect_installed_frameworks -from plexe.helpers import compute_metric, select_viable_model_types +from plexe.helpers import ( + _evaluate_predictor, + compute_metric, + metric_requires_probabilities, + normalize_probability_predictions, + select_viable_model_types, +) from plexe.models import DataLayout @@ -85,3 +92,137 @@ def test_compute_metric_map_grouped(): result = compute_metric(y_true, y_pred, "map", group_ids=group_ids) assert result == pytest.approx(0.75) + + +def test_metric_requires_probabilities(): + assert metric_requires_probabilities("roc_auc") + assert metric_requires_probabilities("log_loss") + assert not metric_requires_probabilities("accuracy") + + +def test_normalize_probability_predictions_binary_matrix_to_positive_scores(): + y_true = np.array([0, 1, 1, 0]) + probs = np.array([[0.8, 0.2], [0.1, 0.9], [0.2, 0.8], [0.9, 0.1]]) + + normalized = normalize_probability_predictions(y_true, probs, "roc_auc") + + assert normalized.shape == (4,) + assert np.allclose(normalized, np.array([0.2, 0.9, 0.8, 0.1])) + + +def test_normalize_probability_predictions_multiclass_keeps_matrix(): + y_true = np.array([0, 1, 2]) + probs = np.array( + [ + [0.8, 0.1, 0.1], + [0.1, 0.7, 0.2], + [0.1, 0.2, 0.7], + ] + ) + + normalized = normalize_probability_predictions(y_true, probs, "roc_auc_ovr") + + assert normalized.shape == (3, 3) + assert np.allclose(normalized, probs) + + +def test_normalize_probability_predictions_multiclass_raises_on_1d(): + y_true = np.array([0, 1, 2]) + probs = np.array([0.2, 0.6, 0.4]) + + with pytest.raises(ValueError, match="per-class probabilities"): + normalize_probability_predictions(y_true, probs, "log_loss") + + +class _DummySparkDataFrame: + def __init__(self, pdf: pd.DataFrame): + self._pdf = pdf + + def toPandas(self) -> pd.DataFrame: + return self._pdf + + +class _DummySparkReader: + def __init__(self, pdf: pd.DataFrame): + self._pdf = pdf + + def parquet(self, _uri: str) -> _DummySparkDataFrame: + return _DummySparkDataFrame(self._pdf) + + +class _DummySpark: + def __init__(self, pdf: pd.DataFrame): + self.read = _DummySparkReader(pdf) + + +class _PredictorWithProba: + def __init__(self): + self.predict_calls = 0 + self.predict_proba_calls = 0 + + def predict(self, x: pd.DataFrame) -> pd.DataFrame: + self.predict_calls += 1 + return pd.DataFrame({"prediction": np.zeros(len(x), dtype=int)}) + + def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: + self.predict_proba_calls += 1 + return pd.DataFrame( + { + "proba_0": np.array([0.9, 0.2, 0.1, 0.8]), + "proba_1": np.array([0.1, 0.8, 0.9, 0.2]), + } + ) + + +def test_evaluate_predictor_uses_predict_for_label_metrics(): + predictor = _PredictorWithProba() + df = pd.DataFrame({"feature": [1, 2, 3, 4], "target": [0, 1, 0, 1]}) + + score = _evaluate_predictor( + spark=_DummySpark(df), + predictor=predictor, + data_uri="unused", + metric="accuracy", + target_columns=["target"], + group_column=None, + ) + + assert isinstance(score, float) + assert predictor.predict_calls == 1 + assert predictor.predict_proba_calls == 0 + + +def test_evaluate_predictor_uses_predict_proba_for_probability_metrics(): + predictor = _PredictorWithProba() + df = pd.DataFrame({"feature": [1, 2, 3, 4], "target": [0, 1, 1, 0]}) + + score = _evaluate_predictor( + spark=_DummySpark(df), + predictor=predictor, + data_uri="unused", + metric="roc_auc", + target_columns=["target"], + group_column=None, + ) + + assert isinstance(score, float) + assert predictor.predict_calls == 0 + assert predictor.predict_proba_calls == 1 + + +def test_evaluate_predictor_raises_when_probability_metric_missing_predict_proba(): + class _PredictOnly: + def predict(self, x: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame({"prediction": np.zeros(len(x), dtype=int)}) + + df = pd.DataFrame({"feature": [1, 2, 3, 4], "target": [0, 1, 1, 0]}) + + with pytest.raises(ValueError, match="does not implement predict_proba"): + _evaluate_predictor( + spark=_DummySpark(df), + predictor=_PredictOnly(), + data_uri="unused", + metric="roc_auc", + target_columns=["target"], + group_column=None, + ) diff --git a/tests/unit/test_keras_predictor.py b/tests/unit/test_keras_predictor.py new file mode 100644 index 00000000..845f29fa --- /dev/null +++ b/tests/unit/test_keras_predictor.py @@ -0,0 +1,79 @@ +"""Unit tests for Keras predictor template semantics.""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from plexe.templates.inference.keras_predictor import KerasPredictor + + +class DummyPipeline: + """Minimal pipeline stub for tests.""" + + def transform(self, x): + return x + + +class DummyModel: + """Minimal model stub for tests.""" + + def __init__(self, output): + self._output = output + + def predict(self, x, verbose=0): + return self._output + + +def test_keras_probabilities_from_binary_logits() -> None: + predictor = KerasPredictor.__new__(KerasPredictor) + predictor._task_type = "binary_classification" + predictor._loss_class = "BinaryCrossentropy" + predictor._loss_config = {"from_logits": True} + + probs = predictor._probabilities_from_raw(np.array([[-2.0], [0.0], [2.0]])) + + assert probs.shape == (3, 2) + assert np.allclose(probs[:, 0] + probs[:, 1], np.ones(3)) + assert probs[0, 1] < probs[1, 1] < probs[2, 1] + + +def test_keras_probabilities_from_binary_two_logit_output() -> None: + predictor = KerasPredictor.__new__(KerasPredictor) + predictor._task_type = "binary_classification" + predictor._loss_class = "BinaryCrossentropy" + predictor._loss_config = {"from_logits": True} + + probs = predictor._probabilities_from_raw(np.array([[2.0, -2.0], [-2.0, 2.0]])) + + assert probs.shape == (2, 2) + assert np.isclose(probs[0].sum(), 1.0) + assert np.isclose(probs[1].sum(), 1.0) + assert probs[0, 0] > probs[0, 1] + assert probs[1, 1] > probs[1, 0] + + +def test_keras_probabilities_from_multiclass_logits() -> None: + predictor = KerasPredictor.__new__(KerasPredictor) + predictor._task_type = "multiclass_classification" + predictor._loss_class = "SparseCategoricalCrossentropy" + predictor._loss_config = {"from_logits": True} + + probs = predictor._probabilities_from_raw(np.array([[1.0, 2.0, 3.0]])) + + assert probs.shape == (1, 3) + assert np.isclose(probs.sum(), 1.0) + assert np.argmax(probs, axis=1)[0] == 2 + + +def test_keras_predict_proba_raises_for_regression() -> None: + predictor = KerasPredictor.__new__(KerasPredictor) + predictor._task_type = "regression" + predictor._loss_class = "MeanSquaredError" + predictor._loss_config = {} + predictor.pipeline = DummyPipeline() + predictor.model = DummyModel(np.array([[0.5], [0.7]])) + + with pytest.raises(ValueError, match="only valid for classification"): + predictor.predict_proba(pd.DataFrame({"f1": [1.0, 2.0]})) diff --git a/tests/unit/test_lightgbm_predictor.py b/tests/unit/test_lightgbm_predictor.py index ede2dad7..475d26ef 100644 --- a/tests/unit/test_lightgbm_predictor.py +++ b/tests/unit/test_lightgbm_predictor.py @@ -9,6 +9,7 @@ import joblib import numpy as np import pandas as pd +import pytest from sklearn.preprocessing import LabelEncoder from plexe.templates.inference.lightgbm_predictor import LightGBMPredictor @@ -21,6 +22,13 @@ def predict(self, x): return np.zeros(len(x), dtype=int) +class DummyClassificationModel(DummyModel): + """Minimal model stub with predict_proba for classification.""" + + def predict_proba(self, x): + return np.tile(np.array([[0.7, 0.3]]), (len(x), 1)) + + class DummyPipeline: """Minimal pipeline stub with a transform method.""" @@ -43,6 +51,12 @@ def _write_artifacts(base_dir: Path, with_encoder: bool = False) -> Path: return artifacts_dir +def _write_metadata(base_dir: Path, task_type: str) -> None: + artifacts_dir = base_dir / "artifacts" + metadata_path = artifacts_dir / "metadata.json" + metadata_path.write_text(f'{{"task_type": "{task_type}"}}', encoding="utf-8") + + def test_lightgbm_predictor_basic(tmp_path: Path) -> None: _write_artifacts(tmp_path) @@ -64,3 +78,28 @@ def test_lightgbm_predictor_label_encoder(tmp_path: Path) -> None: predictions = predictor.predict(input_df)["prediction"].tolist() assert predictions == ["no", "no"] + + +def test_lightgbm_predictor_predict_proba_classification(tmp_path: Path) -> None: + artifacts_dir = _write_artifacts(tmp_path) + joblib.dump(DummyClassificationModel(), artifacts_dir / "model.pkl") + _write_metadata(tmp_path, "binary_classification") + + predictor = LightGBMPredictor(str(tmp_path)) + input_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + probabilities = predictor.predict_proba(input_df) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + + +def test_lightgbm_predictor_predict_proba_raises_for_regression(tmp_path: Path) -> None: + _write_artifacts(tmp_path) + _write_metadata(tmp_path, "regression") + + predictor = LightGBMPredictor(str(tmp_path)) + input_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + with pytest.raises(ValueError, match="only valid for classification"): + predictor.predict_proba(input_df) diff --git a/tests/unit/test_pytorch_predictor.py b/tests/unit/test_pytorch_predictor.py new file mode 100644 index 00000000..cb12cc38 --- /dev/null +++ b/tests/unit/test_pytorch_predictor.py @@ -0,0 +1,50 @@ +"""Unit tests for PyTorch predictor template semantics.""" + +from __future__ import annotations + +import pandas as pd +import pytest + +torch = pytest.importorskip("torch") + +from plexe.templates.inference.pytorch_predictor import PyTorchPredictor + + +class DummyPipeline: + """Minimal pipeline stub for tests.""" + + def transform(self, x): + return x.values + + +class DummyModel: + """Minimal callable model stub for tests.""" + + def __init__(self, outputs): + self._outputs = outputs + + def __call__(self, x_tensor): + return self._outputs + + +def test_pytorch_predict_proba_binary_classification() -> None: + predictor = PyTorchPredictor.__new__(PyTorchPredictor) + predictor._task_type = "binary_classification" + predictor.pipeline = DummyPipeline() + predictor.model = DummyModel(torch.tensor([[-2.0], [2.0]], dtype=torch.float32)) + + probabilities = predictor.predict_proba(pd.DataFrame({"f1": [0.0, 1.0]})) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + assert probabilities.iloc[0]["proba_1"] < probabilities.iloc[1]["proba_1"] + + +def test_pytorch_predict_proba_raises_for_regression() -> None: + predictor = PyTorchPredictor.__new__(PyTorchPredictor) + predictor._task_type = "regression" + predictor.pipeline = DummyPipeline() + predictor.model = DummyModel(torch.tensor([[0.2], [0.8]], dtype=torch.float32)) + + with pytest.raises(ValueError, match="only valid for classification"): + predictor.predict_proba(pd.DataFrame({"f1": [0.0, 1.0]})) diff --git a/tests/unit/test_xgboost_predictor.py b/tests/unit/test_xgboost_predictor.py new file mode 100644 index 00000000..5237eb75 --- /dev/null +++ b/tests/unit/test_xgboost_predictor.py @@ -0,0 +1,60 @@ +"""Unit tests for XGBoost predictor template.""" + +from __future__ import annotations + +from pathlib import Path + +import joblib +import numpy as np +import pandas as pd +import pytest + +from plexe.templates.inference.xgboost_predictor import XGBoostPredictor + + +class DummyModel: + """Minimal predictor stub for tests.""" + + def predict(self, x): + return np.zeros(len(x), dtype=int) + + def predict_proba(self, x): + return np.tile(np.array([[0.6, 0.4]]), (len(x), 1)) + + +class DummyPipeline: + """Minimal pipeline stub for tests.""" + + def transform(self, x): + return x + + +def _write_artifacts(base_dir: Path, task_type: str) -> None: + artifacts_dir = base_dir / "artifacts" + artifacts_dir.mkdir(parents=True, exist_ok=True) + + joblib.dump(DummyModel(), artifacts_dir / "model.pkl") + joblib.dump(DummyPipeline(), artifacts_dir / "pipeline.pkl") + (artifacts_dir / "metadata.json").write_text(f'{{"task_type": "{task_type}"}}', encoding="utf-8") + + +def test_xgboost_predictor_predict_proba_classification(tmp_path: Path) -> None: + _write_artifacts(tmp_path, "binary_classification") + + predictor = XGBoostPredictor(str(tmp_path)) + input_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + probabilities = predictor.predict_proba(input_df) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + + +def test_xgboost_predictor_predict_proba_raises_for_regression(tmp_path: Path) -> None: + _write_artifacts(tmp_path, "regression") + + predictor = XGBoostPredictor(str(tmp_path)) + input_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + with pytest.raises(ValueError, match="only valid for classification"): + predictor.predict_proba(input_df) From 4467ae05ca6f929ef9b9a6617ee865b266cdfd6e Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 02:48:16 +0000 Subject: [PATCH 02/12] chore: bump version to 1.4.3 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 970d0c83..aee2a460 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "plexe" -version = "1.4.1" +version = "1.4.3" description = "An agentic framework for building ML models from natural language" authors = [ "Marcello De Bernardi ", From e7cf6e6a49448cfc96df90d8c285faae8285a608 Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 02:50:37 +0000 Subject: [PATCH 03/12] fix: move baseline predict_proba requirement into critical guidance --- plexe/CODE_INDEX.md | 2 +- plexe/agents/baseline_builder.py | 6 +++--- tests/CODE_INDEX.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 19a31ee6..0446bb01 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 02:35:53 +> Generated on 2026-03-03 02:50:38 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/agents/baseline_builder.py b/plexe/agents/baseline_builder.py index 81163f6b..0954b348 100644 --- a/plexe/agents/baseline_builder.py +++ b/plexe/agents/baseline_builder.py @@ -57,8 +57,8 @@ def _build_agent(self, val_sample_df) -> CodeAgent: feedback_section = format_user_feedback_for_prompt(self.context.scratch.get("_user_feedback")) requires_proba = metric_requires_probabilities(self.context.metric.name) proba_requirement = ( - "6. Because the selected primary metric requires probabilities, your class MUST also implement\n" - " `predict_proba(self, x: pd.DataFrame) -> np.ndarray | pd.DataFrame` that returns per-sample scores.\n" + "- Because the selected primary metric requires probabilities, your class MUST also implement\n" + " `predict_proba(self, x: pd.DataFrame) -> np.ndarray | pd.DataFrame` that returns per-sample scores.\n" if requires_proba else "" ) @@ -132,11 +132,11 @@ def _build_agent(self, val_sample_df) -> CodeAgent: "3. Instantiate and validate: validate_baseline_predictor(predictor, name, description)\n" "4. Once the predictor is successfully validated, save its code as string: save_baseline_code(code_string)\n" "5. Call final_answer() with rationale\n" - f"{proba_requirement}" "\n" "## CRITICAL:\n" "- Use task_analysis['output_targets'] to identify target column(s)\n" "- Predictor must have standard .predict(X) -> array interface\n" + f"{proba_requirement}" ), model=PlexeLiteLLMModel( model_id=self.llm_model, diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index 807b17b7..e5af336d 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 02:35:53 +> Generated on 2026-03-03 02:50:38 Test suite structure and test case documentation. From 38556d34873bbe6e3e91709510401d5fcca07ee1 Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 03:03:48 +0000 Subject: [PATCH 04/12] fix: handle lower-metric early stopping without baseline --- plexe/CODE_INDEX.md | 2 +- plexe/search/evolutionary_search_policy.py | 8 ++++---- tests/CODE_INDEX.md | 3 ++- .../search/test_evolutionary_policy_determinism.py | 12 ++++++++++++ 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 0446bb01..aa2e159a 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 02:50:38 +> Generated on 2026-03-03 03:03:49 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/search/evolutionary_search_policy.py b/plexe/search/evolutionary_search_policy.py index a1a331a2..4fb13d5d 100644 --- a/plexe/search/evolutionary_search_policy.py +++ b/plexe/search/evolutionary_search_policy.py @@ -277,11 +277,11 @@ def should_stop(self, journal: SearchJournal, iteration: int, max_iterations: in best = journal.best_performance if journal.optimization_direction == "higher": - has_good_performance = best > baseline * 1.1 - has_exceptional_performance = best > baseline * 1.5 + has_good_performance = best > baseline * 1.1 if baseline > 0 else best > 0 + has_exceptional_performance = best > baseline * 1.5 if baseline > 0 else False else: - has_good_performance = best < baseline * 0.9 - has_exceptional_performance = best < baseline * 0.5 + has_good_performance = best < baseline * 0.9 if baseline > 0 else best >= 0 + has_exceptional_performance = best < baseline * 0.5 if baseline > 0 else False # Stop if highly stagnant AND we have a good solution (>10% improvement over baseline) if stagnation > 0.8 and has_good_performance and len(journal.good_nodes) >= 2: diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index e5af336d..61fac477 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 02:50:38 +> Generated on 2026-03-03 03:03:49 Test suite structure and test case documentation. @@ -108,6 +108,7 @@ Determinism tests for EvolutionarySearchPolicy local RNG behavior. **Functions:** - `test_evolutionary_policy_determinism(monkeypatch, tmp_path)` - No description - `test_evolutionary_exploit_respects_lower_metric_direction(tmp_path)` - No description +- `test_should_stop_lower_metric_without_baseline_can_early_stop()` - No description --- ## `unit/search/test_insight_store.py` diff --git a/tests/unit/search/test_evolutionary_policy_determinism.py b/tests/unit/search/test_evolutionary_policy_determinism.py index e76b4738..7978685b 100644 --- a/tests/unit/search/test_evolutionary_policy_determinism.py +++ b/tests/unit/search/test_evolutionary_policy_determinism.py @@ -68,3 +68,15 @@ def test_evolutionary_exploit_respects_lower_metric_direction(tmp_path): assert selected is not None assert selected.solution_id == 3 + + +def test_should_stop_lower_metric_without_baseline_can_early_stop(): + journal = SearchJournal(optimization_direction="lower") + journal.baseline_performance = 0.0 + for idx, perf in enumerate([0.9, 0.8, 0.7], start=1): + journal.add_node(_make_solution(idx, performance=perf)) + + policy = EvolutionarySearchPolicy(num_drafts=2, seed=456) + policy._calculate_stagnation = MagicMock(return_value=0.9) + + assert policy.should_stop(journal, iteration=5, max_iterations=10) From 4523e3adc3e70668fd62274bc11b32368843ba4e Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 03:16:31 +0000 Subject: [PATCH 05/12] fix: harden probability normalization shape checks --- plexe/CODE_INDEX.md | 2 +- plexe/helpers.py | 5 +++-- tests/CODE_INDEX.md | 4 +++- tests/unit/test_helpers.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index dbaf142f..4da4510a 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 03:12:15 +> Generated on 2026-03-03 03:16:32 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/helpers.py b/plexe/helpers.py index e348fce2..d6771627 100644 --- a/plexe/helpers.py +++ b/plexe/helpers.py @@ -119,10 +119,11 @@ def normalize_probability_predictions(y_true: np.ndarray, y_pred_proba: Any, met if probabilities.shape[1] == 1: probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]]) - if n_classes <= 2: + is_multiclass = probabilities.shape[1] > 2 or n_classes > 2 + if not is_multiclass: return probabilities[:, 1] - if probabilities.shape[1] < n_classes and metric in PROBABILITY_METRICS: + if probabilities.shape[1] != n_classes and metric in PROBABILITY_METRICS: raise ValueError( f"Metric '{metric_name}' requires {n_classes} class probabilities, got {probabilities.shape[1]} columns." ) diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index cb1287ec..67dac6de 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 03:12:15 +> Generated on 2026-03-03 03:16:32 Test suite structure and test case documentation. @@ -219,6 +219,8 @@ Unit tests for workflow helper functions. - `test_normalize_probability_predictions_binary_matrix_to_positive_scores()` - No description - `test_normalize_probability_predictions_multiclass_keeps_matrix()` - No description - `test_normalize_probability_predictions_multiclass_raises_on_1d()` - No description +- `test_normalize_probability_predictions_multiclass_raises_on_extra_columns()` - No description +- `test_normalize_probability_predictions_raises_when_validation_missing_class()` - No description - `test_evaluate_predictor_uses_predict_for_label_metrics()` - No description - `test_evaluate_predictor_uses_predict_proba_for_probability_metrics()` - No description - `test_evaluate_predictor_raises_when_probability_metric_missing_predict_proba()` - No description diff --git a/tests/unit/test_helpers.py b/tests/unit/test_helpers.py index 6f839e92..f7d0fabe 100644 --- a/tests/unit/test_helpers.py +++ b/tests/unit/test_helpers.py @@ -134,6 +134,35 @@ def test_normalize_probability_predictions_multiclass_raises_on_1d(): normalize_probability_predictions(y_true, probs, "log_loss") +def test_normalize_probability_predictions_multiclass_raises_on_extra_columns(): + y_true = np.array([0, 1, 2]) + probs = np.array( + [ + [0.7, 0.1, 0.1, 0.1], + [0.1, 0.7, 0.1, 0.1], + [0.1, 0.1, 0.7, 0.1], + ] + ) + + with pytest.raises(ValueError, match="requires 3 class probabilities, got 4 columns"): + normalize_probability_predictions(y_true, probs, "roc_auc_ovr") + + +def test_normalize_probability_predictions_raises_when_validation_missing_class(): + y_true = np.array([0, 1, 1, 0]) + probs = np.array( + [ + [0.8, 0.1, 0.1], + [0.1, 0.7, 0.2], + [0.2, 0.6, 0.2], + [0.7, 0.2, 0.1], + ] + ) + + with pytest.raises(ValueError, match="requires 2 class probabilities, got 3 columns"): + normalize_probability_predictions(y_true, probs, "log_loss") + + class _DummySparkDataFrame: def __init__(self, pdf: pd.DataFrame): self._pdf = pdf From b7d4983c0d5eb40a427be6e855a67390b0157c87 Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 03:58:34 +0000 Subject: [PATCH 06/12] fix: harden split validation and keras probability handling --- plexe/CODE_INDEX.md | 5 +- plexe/agents/dataset_splitter.py | 10 ++- plexe/helpers.py | 3 +- plexe/templates/inference/keras_predictor.py | 28 +++++-- plexe/tools/submission.py | 28 ++++++- plexe/validation/validators.py | 59 ++++++++++++++- plexe/workflow.py | 13 ++++ tests/CODE_INDEX.md | 13 +++- tests/unit/test_helpers.py | 4 +- tests/unit/test_keras_predictor.py | 13 ++++ .../unit/test_submission_split_validation.py | 73 +++++++++++++++++++ tests/unit/validation/test_validators.py | 46 ++++++++++++ 12 files changed, 276 insertions(+), 19 deletions(-) create mode 100644 tests/unit/test_submission_split_validation.py diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 4da4510a..c3ec1855 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 03:16:32 +> Generated on 2026-03-03 03:58:35 Code structure and public interface documentation for the **plexe** package. @@ -547,7 +547,7 @@ Submission tools for agents. - `get_register_statistical_profile_tool(context: BuildContext)` - Factory: Returns statistical profile submission tool. - `get_register_layout_tool(context: BuildContext)` - Factory: Returns layout detection submission tool. - `get_register_eda_report_tool(context: BuildContext)` - Factory: Returns EDA report submission tool. -- `get_save_split_uris_tool(context: BuildContext)` - Factory: Returns split URI submission tool. +- `get_save_split_uris_tool(context: BuildContext, spark: Any | None, expected_ratios: dict[str, float] | None)` - Factory: Returns split URI submission tool. - `get_save_sample_uris_tool(context: BuildContext)` - Factory: Returns sample URIs submission tool. - `get_save_metric_implementation_fn(context: BuildContext)` - Factory: Returns metric implementation submission function. - `get_validate_baseline_predictor_tool(context: BuildContext, val_sample_df)` - Factory: Returns baseline predictor validation tool. @@ -702,6 +702,7 @@ OpenTelemetry tracing decorators for agents and tools. Validation functions for pipelines, models, and other agent outputs. **Functions:** +- `canonicalize_split_ratios(split_ratios: dict[str, float] | None) -> dict[str, float]` - Normalize split ratio key aliases to canonical names. - `validate_sklearn_pipeline(pipeline: Pipeline, sample_df: pd.DataFrame, target_columns: list[str]) -> tuple[bool, str]` - Validate that an sklearn Pipeline is well-formed and functional. - `validate_pipeline_consistency(pipeline: Pipeline, train_sample: pd.DataFrame, val_sample: pd.DataFrame, target_columns: list[str]) -> tuple[bool, str]` - Validate pipeline produces consistent output shape on train/val samples. - `validate_xgboost_params(params: dict[str, Any]) -> tuple[bool, str]` - Validate XGBoost hyperparameters. diff --git a/plexe/agents/dataset_splitter.py b/plexe/agents/dataset_splitter.py index 18fc551a..5d4837a2 100644 --- a/plexe/agents/dataset_splitter.py +++ b/plexe/agents/dataset_splitter.py @@ -53,10 +53,12 @@ def __init__(self, spark: SparkSession, dataset_uri: str, context: BuildContext, self.config = config self.llm_model = config.dataset_splitting_llm - def _build_agent(self) -> CodeAgent: + def _build_agent(self, split_ratios: dict[str, float]) -> CodeAgent: """Build CodeAgent with splitting tool.""" # Get routing configuration for this agent's model api_base, headers = get_routing_for_model(self.config.routing_config, self.llm_model) + # TODO(splitter-prompts): Make split instructions conditional on requested split mode. + # 2-way modes should not instruct writing test.parquet or passing test_uri. return CodeAgent( name="DatasetSplitter", @@ -142,7 +144,7 @@ def _build_agent(self) -> CodeAgent: extra_headers=headers, ), verbosity_level=self.config.agent_verbosity_level, - tools=[get_save_split_uris_tool(self.context)], + tools=[get_save_split_uris_tool(self.context, self.spark, split_ratios)], add_base_tools=False, additional_authorized_imports=self.config.allowed_base_imports + [ @@ -189,7 +191,7 @@ def run(self, split_ratios: dict[str, float], output_dir: str | Path) -> tuple[s output_dir_str = str(output_dir) # Build agent - agent = self._build_agent() + agent = self._build_agent(split_ratios) # Build task prompt (use string version of output_dir) task = self._build_task_prompt(split_ratios, output_dir_str) @@ -263,6 +265,8 @@ def _build_task_prompt(self, split_ratios: dict[str, float], output_dir: str) -> prompt += ( "\n" + # TODO(splitter-prompts): Make this task prompt explicitly 2-way vs 3-way. + # Current wording always asks for train/val/test outputs, which can induce accidental holdouts. "Based on the task type and data characteristics, choose the appropriate splitting strategy:\n" "- Classification → Stratified split (preserve class balance)\n" "- Forecasting future events/values → Chronological split (train on past, test on future)\n" diff --git a/plexe/helpers.py b/plexe/helpers.py index d6771627..ff99c9d0 100644 --- a/plexe/helpers.py +++ b/plexe/helpers.py @@ -125,7 +125,8 @@ def normalize_probability_predictions(y_true: np.ndarray, y_pred_proba: Any, met if probabilities.shape[1] != n_classes and metric in PROBABILITY_METRICS: raise ValueError( - f"Metric '{metric_name}' requires {n_classes} class probabilities, got {probabilities.shape[1]} columns." + f"Probability matrix has {probabilities.shape[1]} columns but validation labels contain {n_classes} " + f"distinct classes for metric '{metric_name}'. Ensure validation data includes all expected classes." ) return probabilities diff --git a/plexe/templates/inference/keras_predictor.py b/plexe/templates/inference/keras_predictor.py index 6af1587d..0e5b6b72 100644 --- a/plexe/templates/inference/keras_predictor.py +++ b/plexe/templates/inference/keras_predictor.py @@ -66,14 +66,14 @@ def __init__(self, model_dir: str): def _uses_logits_output(self) -> bool: """Return True when model outputs are logits based on training loss metadata.""" - from_logits = bool(self._loss_config.get("from_logits", False)) + from_logits = self._loss_config.get("from_logits") if self._task_type == "binary_classification" and self._loss_class == "BinaryCrossentropy": - return from_logits + return bool(from_logits) if self._task_type == "multiclass_classification" and self._loss_class in { "SparseCategoricalCrossentropy", "CategoricalCrossentropy", }: - return from_logits + return bool(from_logits) return False def _probabilities_from_raw(self, raw_predictions): @@ -83,17 +83,33 @@ def _probabilities_from_raw(self, raw_predictions): probabilities = np.asarray(raw_predictions) if probabilities.ndim == 1: probabilities = probabilities.reshape(-1, 1) + uses_logits = self._uses_logits_output() + + # Legacy model metadata may omit loss_config.from_logits. + # If outputs are clearly outside probability bounds, treat them as logits. + if ( + not uses_logits + and not self._loss_config + and self._task_type + in { + "binary_classification", + "multiclass_classification", + } + ): + finite = probabilities[np.isfinite(probabilities)] + if finite.size > 0 and (finite.min() < 0.0 or finite.max() > 1.0): + uses_logits = True if self._task_type == "binary_classification": if probabilities.shape[1] == 1: positive = probabilities[:, 0] - if self._uses_logits_output(): + if uses_logits: positive = 1.0 / (1.0 + np.exp(-positive)) probabilities = np.column_stack([1 - positive, positive]) return probabilities if probabilities.shape[1] == 2: - if self._uses_logits_output(): + if uses_logits: shifted = probabilities - np.max(probabilities, axis=1, keepdims=True) exp_values = np.exp(shifted) probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True) @@ -102,7 +118,7 @@ def _probabilities_from_raw(self, raw_predictions): raise ValueError(f"Binary classification expects 1 or 2 outputs, got shape {probabilities.shape}") if self._task_type == "multiclass_classification": - if self._uses_logits_output(): + if uses_logits: shifted = probabilities - np.max(probabilities, axis=1, keepdims=True) exp_values = np.exp(shifted) probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True) diff --git a/plexe/tools/submission.py b/plexe/tools/submission.py index 8589f9ca..81e0d8ce 100644 --- a/plexe/tools/submission.py +++ b/plexe/tools/submission.py @@ -17,7 +17,12 @@ from plexe.models import BuildContext, Metric, Hypothesis, TaskType, UnifiedPlan from plexe.search.insight_store import InsightStore from plexe.utils.tracing import tool_span -from plexe.validation.validators import validate_sklearn_pipeline, validate_pipeline_consistency +from plexe.validation.validators import ( + canonicalize_split_ratios, + validate_dataset_splits, + validate_pipeline_consistency, + validate_sklearn_pipeline, +) logger = logging.getLogger(__name__) @@ -787,16 +792,22 @@ def save_eda_report( return save_eda_report -def get_save_split_uris_tool(context: BuildContext): +def get_save_split_uris_tool( + context: BuildContext, spark: Any | None = None, expected_ratios: dict[str, float] | None = None +): """ Factory: Returns split URI submission tool. Args: context: Build context for storing result + spark: Optional SparkSession for immediate split validation + expected_ratios: Optional expected split ratios (train/val/test) Returns: Configured tool """ + normalized_expected = canonicalize_split_ratios(expected_ratios) + expects_test_split = normalized_expected.get("test", 0.0) > 0 @tool @tool_span @@ -820,6 +831,19 @@ def save_split_uris(train_uri: str, val_uri: str, test_uri: str = None) -> str: Returns: Confirmation message """ + if expects_test_split and not test_uri: + raise ValueError("A non-empty test split is required for this run. Provide test_uri.") + + if spark is not None and normalized_expected: + is_valid, error_msg = validate_dataset_splits( + spark=spark, + train_uri=train_uri, + val_uri=val_uri, + test_uri=test_uri, + expected_ratios=normalized_expected, + ) + if not is_valid: + raise ValueError(f"Split validation failed: {error_msg}") # Save URIs to context scratch context.scratch["_train_uri"] = train_uri diff --git a/plexe/validation/validators.py b/plexe/validation/validators.py index c50805c1..0f040761 100644 --- a/plexe/validation/validators.py +++ b/plexe/validation/validators.py @@ -16,6 +16,43 @@ logger = logging.getLogger(__name__) +# ============================================ +# Dataset Split Validation +# ============================================ + + +def canonicalize_split_ratios(split_ratios: dict[str, float] | None) -> dict[str, float]: + """ + Normalize split ratio key aliases to canonical names. + + Canonical keys are: train, val, test. + Unknown keys and non-numeric values are ignored. + """ + if not split_ratios: + return {} + + alias_map = { + "train": "train", + "val": "val", + "valid": "val", + "validation": "val", + "test": "test", + } + + normalized: dict[str, float] = {} + for key, value in split_ratios.items(): + if not isinstance(value, int | float): + continue + + canonical_key = alias_map.get(str(key).strip().lower()) + if canonical_key is None: + continue + + normalized[canonical_key] = float(value) + + return normalized + + # ============================================ # Pipeline Validation # ============================================ @@ -342,6 +379,9 @@ def validate_dataset_splits( Returns: (is_valid, error_message) """ + normalized_expected = canonicalize_split_ratios(expected_ratios) + expects_test_split = normalized_expected.get("test", 0.0) > 0 + # Validate row counts (existence check implicit - count() fails if dataset doesn't exist) try: train_count = spark.read.parquet(train_uri).count() @@ -356,16 +396,31 @@ def validate_dataset_splits( except Exception as e: return False, f"Failed to read split datasets: {e}" + if train_count == 0: + return False, "Train split is empty" + if val_count == 0: + return False, "Validation split is empty" + if expects_test_split: + if not test_uri: + return False, "Test split expected but test URI was not provided" + if test_count == 0: + return False, "Test split is empty but final evaluation requires a non-empty test split" + + if total == 0: + return False, "All splits are empty" + logger.info(f"Split sizes: train={train_count}, val={val_count}, test={test_count}, total={total}") # Check ratios are within reasonable tolerance (10%) actual_ratios = {"train": train_count / total, "val": val_count / total, "test": test_count / total} + # TODO(split-validation): Escalate severe ratio drift to hard failure (not warning-only), + # so agent retries before finishing Phase 2. # Only check splits that exist in actual_ratios (ignore extra keys like "rationale") for split_name in actual_ratios.keys(): - if split_name not in expected_ratios: + if split_name not in normalized_expected: continue - expected = expected_ratios[split_name] + expected = normalized_expected[split_name] actual = actual_ratios[split_name] diff = abs(actual - expected) diff --git a/plexe/workflow.py b/plexe/workflow.py index 3b0b519a..88a709d8 100644 --- a/plexe/workflow.py +++ b/plexe/workflow.py @@ -63,6 +63,7 @@ from plexe.templates.features.pipeline_runner import transform_dataset_via_spark from plexe.templates.packaging.model_card_template import generate_model_card from plexe.helpers import evaluate_on_sample, select_viable_model_types +from plexe.validation.validators import canonicalize_split_ratios logger = logging.getLogger(__name__) @@ -917,6 +918,16 @@ def prepare_data( else: # Default fallback split_ratios = {"train": 0.7, "val": 0.15, "test": 0.15} + + split_ratios = canonicalize_split_ratios(split_ratios) + if not {"train", "val", "test"}.issubset(split_ratios): + logger.warning( + "Recommended split ratios are missing one of train/val/test (%s); " + "falling back to default 70/15/15 for final evaluation.", + split_ratios, + ) + split_ratios = {"train": 0.7, "val": 0.15, "test": 0.15} + logger.info("Creating train/val/test splits from single dataset (final evaluation enabled)") else: # 2-way split: train/val only @@ -1792,6 +1803,8 @@ def evaluate_final( if context.test_uri: logger.info(f"Loading test sample from {context.test_uri}") test_df_spark = spark.read.parquet(context.test_uri) + # TODO(evaluation-guard): Fail fast with a clear message when test split is empty, + # instead of letting downstream evaluator phases fail indirectly. # Sample for evaluation (20k-50k rows) sample_size = min(50000, test_df_spark.count()) test_sample_df = test_df_spark.limit(sample_size).toPandas() diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index 67dac6de..fb245e70 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 03:16:32 +> Generated on 2026-03-03 03:58:35 Test suite structure and test case documentation. @@ -247,6 +247,7 @@ Unit tests for Keras predictor template semantics. - `test_keras_probabilities_from_binary_logits() -> None` - No description - `test_keras_probabilities_from_binary_two_logit_output() -> None` - No description - `test_keras_probabilities_from_multiclass_logits() -> None` - No description +- `test_keras_probabilities_infer_logits_when_loss_config_missing() -> None` - No description - `test_keras_predict_proba_raises_for_regression() -> None` - No description --- @@ -300,6 +301,14 @@ Unit tests for PyTorch model submission. **Functions:** - `test_save_model_pytorch(tmp_path)` - Test PyTorch model submission validation and context scratch storage. +--- +## `unit/test_submission_split_validation.py` +Unit tests for split URI submission validation. + +**Functions:** +- `test_save_split_uris_requires_test_when_expected(tmp_path)` - No description +- `test_save_split_uris_canonicalizes_validation_key(tmp_path)` - No description + --- ## `unit/test_xgboost_predictor.py` Unit tests for XGBoost predictor template. @@ -372,6 +381,8 @@ Unit tests for validation functions. - `test_validate_model_definition_unknown_type()` - Test unknown model type fails validation. - `test_validate_metric_function_object_success()` - Callable with correct signature should pass. - `test_validate_metric_function_object_bad_signature()` - Callable with wrong arg names should fail. +- `test_canonicalize_split_ratios_maps_validation_alias()` - No description +- `test_validate_dataset_splits_fails_when_expected_test_is_empty()` - No description --- ## `unit/workflow/test_checkpoint_resume_feedback.py` diff --git a/tests/unit/test_helpers.py b/tests/unit/test_helpers.py index f7d0fabe..4e0a7f23 100644 --- a/tests/unit/test_helpers.py +++ b/tests/unit/test_helpers.py @@ -144,7 +144,7 @@ def test_normalize_probability_predictions_multiclass_raises_on_extra_columns(): ] ) - with pytest.raises(ValueError, match="requires 3 class probabilities, got 4 columns"): + with pytest.raises(ValueError, match="Probability matrix has 4 columns"): normalize_probability_predictions(y_true, probs, "roc_auc_ovr") @@ -159,7 +159,7 @@ def test_normalize_probability_predictions_raises_when_validation_missing_class( ] ) - with pytest.raises(ValueError, match="requires 2 class probabilities, got 3 columns"): + with pytest.raises(ValueError, match="validation labels contain 2 distinct classes"): normalize_probability_predictions(y_true, probs, "log_loss") diff --git a/tests/unit/test_keras_predictor.py b/tests/unit/test_keras_predictor.py index 845f29fa..5964338d 100644 --- a/tests/unit/test_keras_predictor.py +++ b/tests/unit/test_keras_predictor.py @@ -67,6 +67,19 @@ def test_keras_probabilities_from_multiclass_logits() -> None: assert np.argmax(probs, axis=1)[0] == 2 +def test_keras_probabilities_infer_logits_when_loss_config_missing() -> None: + predictor = KerasPredictor.__new__(KerasPredictor) + predictor._task_type = "binary_classification" + predictor._loss_class = "BinaryCrossentropy" + predictor._loss_config = {} + + probs = predictor._probabilities_from_raw(np.array([[-2.0], [2.0]])) + + assert probs.shape == (2, 2) + assert np.allclose(probs[:, 0] + probs[:, 1], np.ones(2)) + assert probs[0, 1] < probs[1, 1] + + def test_keras_predict_proba_raises_for_regression() -> None: predictor = KerasPredictor.__new__(KerasPredictor) predictor._task_type = "regression" diff --git a/tests/unit/test_submission_split_validation.py b/tests/unit/test_submission_split_validation.py new file mode 100644 index 00000000..5af9eb6a --- /dev/null +++ b/tests/unit/test_submission_split_validation.py @@ -0,0 +1,73 @@ +"""Unit tests for split URI submission validation.""" + +from __future__ import annotations + +import pytest + +from plexe.models import BuildContext +from plexe.tools.submission import get_save_split_uris_tool + + +class _DummySparkFrame: + def __init__(self, count: int): + self._count = count + + def count(self) -> int: + return self._count + + +class _DummySparkReader: + def __init__(self, counts: dict[str, int]): + self._counts = counts + + def parquet(self, uri: str) -> _DummySparkFrame: + if uri not in self._counts: + raise ValueError(f"Unknown URI: {uri}") + return _DummySparkFrame(self._counts[uri]) + + +class _DummySpark: + def __init__(self, counts: dict[str, int]): + self.read = _DummySparkReader(counts) + + +def _make_context(tmp_path) -> BuildContext: + return BuildContext( + user_id="user", + experiment_id="exp", + dataset_uri="dataset.parquet", + work_dir=tmp_path, + intent="predict transported", + ) + + +def test_save_split_uris_requires_test_when_expected(tmp_path): + context = _make_context(tmp_path) + spark = _DummySpark({"train_uri": 80, "val_uri": 20}) + + save_split_uris = get_save_split_uris_tool( + context=context, + spark=spark, + expected_ratios={"train": 0.7, "val": 0.15, "test": 0.15}, + ) + + with pytest.raises(ValueError, match="non-empty test split is required"): + save_split_uris(train_uri="train_uri", val_uri="val_uri") + + +def test_save_split_uris_canonicalizes_validation_key(tmp_path): + context = _make_context(tmp_path) + spark = _DummySpark({"train_uri": 80, "val_uri": 20}) + + save_split_uris = get_save_split_uris_tool( + context=context, + spark=spark, + expected_ratios={"train": 0.8, "validation": 0.2}, + ) + + message = save_split_uris(train_uri="train_uri", val_uri="val_uri") + + assert "saved successfully" in message.lower() + assert context.scratch["_train_uri"] == "train_uri" + assert context.scratch["_val_uri"] == "val_uri" + assert context.scratch["_test_uri"] is None diff --git a/tests/unit/validation/test_validators.py b/tests/unit/validation/test_validators.py index 985f79a9..e20c03e4 100644 --- a/tests/unit/validation/test_validators.py +++ b/tests/unit/validation/test_validators.py @@ -7,6 +7,8 @@ from sklearn.preprocessing import StandardScaler from plexe.validation.validators import ( + canonicalize_split_ratios, + validate_dataset_splits, validate_sklearn_pipeline, validate_xgboost_params, validate_model_definition, @@ -127,3 +129,47 @@ def metric_fn(a, b): assert not is_valid assert "Arguments must be named" in error + + +class _DummySparkFrame: + def __init__(self, count: int): + self._count = count + + def count(self) -> int: + return self._count + + +class _DummySparkReader: + def __init__(self, counts: dict[str, int]): + self._counts = counts + + def parquet(self, uri: str) -> _DummySparkFrame: + if uri not in self._counts: + raise ValueError(f"Unknown URI: {uri}") + return _DummySparkFrame(self._counts[uri]) + + +class _DummySpark: + def __init__(self, counts: dict[str, int]): + self.read = _DummySparkReader(counts) + + +def test_canonicalize_split_ratios_maps_validation_alias(): + ratios = canonicalize_split_ratios({"train": 0.8, "validation": 0.2, "note": "ignored"}) + + assert ratios == {"train": 0.8, "val": 0.2} + + +def test_validate_dataset_splits_fails_when_expected_test_is_empty(): + spark = _DummySpark({"train_uri": 80, "val_uri": 20, "test_uri": 0}) + + is_valid, error = validate_dataset_splits( + spark=spark, + train_uri="train_uri", + val_uri="val_uri", + test_uri="test_uri", + expected_ratios={"train": 0.7, "val": 0.15, "test": 0.15}, + ) + + assert not is_valid + assert "Test split is empty" in error From 4992c0d6eb04740449448da81930254fd0ca32b3 Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 04:10:58 +0000 Subject: [PATCH 07/12] fix: clarify multiclass probability shape mismatch errors --- plexe/CODE_INDEX.md | 2 +- plexe/helpers.py | 10 ++++++++-- tests/CODE_INDEX.md | 3 ++- tests/unit/test_helpers.py | 10 +++++++++- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index c3ec1855..7266176b 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 03:58:35 +> Generated on 2026-03-03 04:10:59 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/helpers.py b/plexe/helpers.py index ff99c9d0..be7398fd 100644 --- a/plexe/helpers.py +++ b/plexe/helpers.py @@ -116,6 +116,7 @@ def normalize_probability_predictions(y_true: np.ndarray, y_pred_proba: Any, met if probabilities.ndim != 2: raise ValueError(f"Expected probability outputs to be 1D or 2D, got shape {probabilities.shape}") + original_n_cols = probabilities.shape[1] if probabilities.shape[1] == 1: probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]]) @@ -124,9 +125,14 @@ def normalize_probability_predictions(y_true: np.ndarray, y_pred_proba: Any, met return probabilities[:, 1] if probabilities.shape[1] != n_classes and metric in PROBABILITY_METRICS: + reported_n_cols = ( + original_n_cols if original_n_cols == 1 and probabilities.shape[1] == 2 else probabilities.shape[1] + ) + column_label = "column" if reported_n_cols == 1 else "columns" raise ValueError( - f"Probability matrix has {probabilities.shape[1]} columns but validation labels contain {n_classes} " - f"distinct classes for metric '{metric_name}'. Ensure validation data includes all expected classes." + f"Probability outputs have {reported_n_cols} {column_label} but validation labels contain {n_classes} " + f"distinct classes for metric '{metric_name}'. For multiclass tasks, predictor.predict_proba " + f"must return one probability column per class." ) return probabilities diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index fb245e70..42256ee7 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 03:58:35 +> Generated on 2026-03-03 04:10:59 Test suite structure and test case documentation. @@ -220,6 +220,7 @@ Unit tests for workflow helper functions. - `test_normalize_probability_predictions_multiclass_keeps_matrix()` - No description - `test_normalize_probability_predictions_multiclass_raises_on_1d()` - No description - `test_normalize_probability_predictions_multiclass_raises_on_extra_columns()` - No description +- `test_normalize_probability_predictions_multiclass_raises_on_single_column_matrix()` - No description - `test_normalize_probability_predictions_raises_when_validation_missing_class()` - No description - `test_evaluate_predictor_uses_predict_for_label_metrics()` - No description - `test_evaluate_predictor_uses_predict_proba_for_probability_metrics()` - No description diff --git a/tests/unit/test_helpers.py b/tests/unit/test_helpers.py index 4e0a7f23..1118653b 100644 --- a/tests/unit/test_helpers.py +++ b/tests/unit/test_helpers.py @@ -144,10 +144,18 @@ def test_normalize_probability_predictions_multiclass_raises_on_extra_columns(): ] ) - with pytest.raises(ValueError, match="Probability matrix has 4 columns"): + with pytest.raises(ValueError, match="Probability outputs have 4 columns"): normalize_probability_predictions(y_true, probs, "roc_auc_ovr") +def test_normalize_probability_predictions_multiclass_raises_on_single_column_matrix(): + y_true = np.array([0, 1, 2]) + probs = np.array([[0.2], [0.5], [0.3]]) + + with pytest.raises(ValueError, match="Probability outputs have 1 column"): + normalize_probability_predictions(y_true, probs, "log_loss") + + def test_normalize_probability_predictions_raises_when_validation_missing_class(): y_true = np.array([0, 1, 1, 0]) probs = np.array( From fdf2dcdf5623483955743f6e7b26552f9f3acf34 Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 04:24:01 +0000 Subject: [PATCH 08/12] fix: handle legacy predictor metadata in predict_proba --- plexe/CODE_INDEX.md | 2 +- plexe/templates/inference/catboost_predictor.py | 2 +- plexe/templates/inference/lightgbm_predictor.py | 2 +- plexe/templates/inference/xgboost_predictor.py | 2 +- plexe/validation/validators.py | 3 --- tests/CODE_INDEX.md | 5 ++++- tests/unit/test_catboost_predictor.py | 17 +++++++++++++++++ tests/unit/test_lightgbm_predictor.py | 13 +++++++++++++ tests/unit/test_xgboost_predictor.py | 13 +++++++++++++ 9 files changed, 51 insertions(+), 8 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 7266176b..3ca660b1 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 04:10:59 +> Generated on 2026-03-03 04:24:02 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/templates/inference/catboost_predictor.py b/plexe/templates/inference/catboost_predictor.py index 439b6edc..addd56af 100644 --- a/plexe/templates/inference/catboost_predictor.py +++ b/plexe/templates/inference/catboost_predictor.py @@ -88,7 +88,7 @@ def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: Returns: DataFrame with probability columns named proba_0..proba_n. """ - if self._task_type not in {"binary_classification", "multiclass_classification"}: + if self._task_type and self._task_type not in {"binary_classification", "multiclass_classification"}: raise ValueError( f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" ) diff --git a/plexe/templates/inference/lightgbm_predictor.py b/plexe/templates/inference/lightgbm_predictor.py index 7eaa3ab4..3e787913 100644 --- a/plexe/templates/inference/lightgbm_predictor.py +++ b/plexe/templates/inference/lightgbm_predictor.py @@ -78,7 +78,7 @@ def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: Returns: DataFrame with probability columns named proba_0..proba_n. """ - if self._task_type not in {"binary_classification", "multiclass_classification"}: + if self._task_type and self._task_type not in {"binary_classification", "multiclass_classification"}: raise ValueError( f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" ) diff --git a/plexe/templates/inference/xgboost_predictor.py b/plexe/templates/inference/xgboost_predictor.py index 9968a688..029d707b 100644 --- a/plexe/templates/inference/xgboost_predictor.py +++ b/plexe/templates/inference/xgboost_predictor.py @@ -78,7 +78,7 @@ def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: Returns: DataFrame with probability columns named proba_0..proba_n. """ - if self._task_type not in {"binary_classification", "multiclass_classification"}: + if self._task_type and self._task_type not in {"binary_classification", "multiclass_classification"}: raise ValueError( f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" ) diff --git a/plexe/validation/validators.py b/plexe/validation/validators.py index 0f040761..1de237d6 100644 --- a/plexe/validation/validators.py +++ b/plexe/validation/validators.py @@ -406,9 +406,6 @@ def validate_dataset_splits( if test_count == 0: return False, "Test split is empty but final evaluation requires a non-empty test split" - if total == 0: - return False, "All splits are empty" - logger.info(f"Split sizes: train={train_count}, val={val_count}, test={test_count}, total={total}") # Check ratios are within reasonable tolerance (10%) diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index 42256ee7..a98f5232 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 04:10:59 +> Generated on 2026-03-03 04:24:02 Test suite structure and test case documentation. @@ -190,6 +190,7 @@ Unit tests for CatBoost predictor template. **Functions:** - `test_catboost_predictor_predict_proba_classification() -> None` - No description +- `test_catboost_predictor_predict_proba_allows_missing_task_metadata() -> None` - No description - `test_catboost_predictor_predict_proba_raises_for_regression() -> None` - No description --- @@ -268,6 +269,7 @@ Unit tests for LightGBM predictor template. - `test_lightgbm_predictor_basic(tmp_path: Path) -> None` - No description - `test_lightgbm_predictor_label_encoder(tmp_path: Path) -> None` - No description - `test_lightgbm_predictor_predict_proba_classification(tmp_path: Path) -> None` - No description +- `test_lightgbm_predictor_predict_proba_without_metadata(tmp_path: Path) -> None` - No description - `test_lightgbm_predictor_predict_proba_raises_for_regression(tmp_path: Path) -> None` - No description --- @@ -323,6 +325,7 @@ Unit tests for XGBoost predictor template. **Functions:** - `test_xgboost_predictor_predict_proba_classification(tmp_path: Path) -> None` - No description +- `test_xgboost_predictor_predict_proba_without_metadata(tmp_path: Path) -> None` - No description - `test_xgboost_predictor_predict_proba_raises_for_regression(tmp_path: Path) -> None` - No description --- diff --git a/tests/unit/test_catboost_predictor.py b/tests/unit/test_catboost_predictor.py index c7c32a31..65f07125 100644 --- a/tests/unit/test_catboost_predictor.py +++ b/tests/unit/test_catboost_predictor.py @@ -35,6 +35,23 @@ def test_catboost_predictor_predict_proba_classification() -> None: assert len(probabilities) == 2 +def test_catboost_predictor_predict_proba_allows_missing_task_metadata() -> None: + model = catboost.CatBoostClassifier(iterations=5, verbose=False) + X_train = pd.DataFrame({"f1": [0.0, 0.1, 0.9, 1.0], "f2": [0.0, 0.2, 0.8, 1.0]}) + y_train = np.array([0, 0, 1, 1]) + model.fit(X_train, y_train) + + predictor = CatBoostPredictor.__new__(CatBoostPredictor) + predictor._task_type = "" + predictor.model = model + predictor.pipeline = DummyPipeline() + + probabilities = predictor.predict_proba(pd.DataFrame({"f1": [0.05, 0.95], "f2": [0.05, 0.95]})) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + + def test_catboost_predictor_predict_proba_raises_for_regression() -> None: predictor = CatBoostPredictor.__new__(CatBoostPredictor) predictor._task_type = "regression" diff --git a/tests/unit/test_lightgbm_predictor.py b/tests/unit/test_lightgbm_predictor.py index 475d26ef..b9d835ae 100644 --- a/tests/unit/test_lightgbm_predictor.py +++ b/tests/unit/test_lightgbm_predictor.py @@ -94,6 +94,19 @@ def test_lightgbm_predictor_predict_proba_classification(tmp_path: Path) -> None assert len(probabilities) == 2 +def test_lightgbm_predictor_predict_proba_without_metadata(tmp_path: Path) -> None: + artifacts_dir = _write_artifacts(tmp_path) + joblib.dump(DummyClassificationModel(), artifacts_dir / "model.pkl") + + predictor = LightGBMPredictor(str(tmp_path)) + input_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + probabilities = predictor.predict_proba(input_df) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + + def test_lightgbm_predictor_predict_proba_raises_for_regression(tmp_path: Path) -> None: _write_artifacts(tmp_path) _write_metadata(tmp_path, "regression") diff --git a/tests/unit/test_xgboost_predictor.py b/tests/unit/test_xgboost_predictor.py index 5237eb75..d2880464 100644 --- a/tests/unit/test_xgboost_predictor.py +++ b/tests/unit/test_xgboost_predictor.py @@ -50,6 +50,19 @@ def test_xgboost_predictor_predict_proba_classification(tmp_path: Path) -> None: assert len(probabilities) == 2 +def test_xgboost_predictor_predict_proba_without_metadata(tmp_path: Path) -> None: + _write_artifacts(tmp_path, "binary_classification") + (tmp_path / "artifacts" / "metadata.json").unlink() + + predictor = XGBoostPredictor(str(tmp_path)) + input_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + probabilities = predictor.predict_proba(input_df) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + + def test_xgboost_predictor_predict_proba_raises_for_regression(tmp_path: Path) -> None: _write_artifacts(tmp_path, "regression") From 342ab190e0e29497ad81810113133700a667ad3f Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 04:33:05 +0000 Subject: [PATCH 09/12] fix: allow pytorch proba inference without metadata --- plexe/CODE_INDEX.md | 2 +- plexe/search/journal.py | 7 ++++++- plexe/templates/inference/pytorch_predictor.py | 2 +- tests/CODE_INDEX.md | 3 ++- tests/unit/test_pytorch_predictor.py | 12 ++++++++++++ 5 files changed, 22 insertions(+), 4 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 3ca660b1..050b16ab 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 04:24:02 +> Generated on 2026-03-03 04:33:06 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/search/journal.py b/plexe/search/journal.py index 1bd92450..4f00312a 100644 --- a/plexe/search/journal.py +++ b/plexe/search/journal.py @@ -124,7 +124,12 @@ def best_node(self) -> Solution | None: good = self.good_nodes if not good: return None - return max(good, key=self.sort_key) + + best = good[0] + for candidate in good[1:]: + if self.is_better(candidate.performance, best.performance): + best = candidate + return best @property def best_performance(self) -> float: diff --git a/plexe/templates/inference/pytorch_predictor.py b/plexe/templates/inference/pytorch_predictor.py index b43fd6d4..71ce858b 100644 --- a/plexe/templates/inference/pytorch_predictor.py +++ b/plexe/templates/inference/pytorch_predictor.py @@ -109,7 +109,7 @@ def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame: Applies sigmoid for single-logit binary models, otherwise softmax. """ - if self._task_type not in {"binary_classification", "multiclass_classification"}: + if self._task_type and self._task_type not in {"binary_classification", "multiclass_classification"}: raise ValueError( f"predict_proba() is only valid for classification tasks, got task_type='{self._task_type or 'unknown'}'" ) diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index a98f5232..b0f43357 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 04:24:02 +> Generated on 2026-03-03 04:33:06 Test suite structure and test case documentation. @@ -295,6 +295,7 @@ Unit tests for PyTorch predictor template semantics. **Functions:** - `test_pytorch_predict_proba_binary_classification() -> None` - No description +- `test_pytorch_predict_proba_allows_missing_task_metadata() -> None` - No description - `test_pytorch_predict_proba_raises_for_regression() -> None` - No description --- diff --git a/tests/unit/test_pytorch_predictor.py b/tests/unit/test_pytorch_predictor.py index cb12cc38..a7522325 100644 --- a/tests/unit/test_pytorch_predictor.py +++ b/tests/unit/test_pytorch_predictor.py @@ -40,6 +40,18 @@ def test_pytorch_predict_proba_binary_classification() -> None: assert probabilities.iloc[0]["proba_1"] < probabilities.iloc[1]["proba_1"] +def test_pytorch_predict_proba_allows_missing_task_metadata() -> None: + predictor = PyTorchPredictor.__new__(PyTorchPredictor) + predictor._task_type = "" + predictor.pipeline = DummyPipeline() + predictor.model = DummyModel(torch.tensor([[-2.0], [2.0]], dtype=torch.float32)) + + probabilities = predictor.predict_proba(pd.DataFrame({"f1": [0.0, 1.0]})) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + + def test_pytorch_predict_proba_raises_for_regression() -> None: predictor = PyTorchPredictor.__new__(PyTorchPredictor) predictor._task_type = "regression" From 89b525728ef67fdcfdcc47f19ddc6604020f474f Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 04:46:10 +0000 Subject: [PATCH 10/12] fix: validate direction assignment and legacy keras proba --- plexe/CODE_INDEX.md | 4 +++- plexe/search/journal.py | 15 ++++++++++++--- plexe/templates/inference/keras_predictor.py | 18 +++++++++++------- tests/CODE_INDEX.md | 4 +++- tests/unit/search/test_journal.py | 7 +++++++ tests/unit/test_keras_predictor.py | 15 +++++++++++++++ 6 files changed, 51 insertions(+), 12 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 050b16ab..34c537fd 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 04:33:06 +> Generated on 2026-03-03 04:46:10 Code structure and public interface documentation for the **plexe** package. @@ -396,6 +396,8 @@ Search journal for tracking model search tree. **`SearchJournal`** - Tracks solution search tree. - `__init__(self, baseline: Baseline | None, optimization_direction: str)` +- `optimization_direction(self) -> str` - Metric optimization direction, constrained to {'higher', 'lower'}. +- `optimization_direction(self, value: str) -> None` - Validate and set optimization direction. - `selection_score(self, value: float) -> float` - Normalize a metric value so larger always means better. - `is_better(self, candidate: float, reference: float | None) -> bool` - Compare two metric values using the configured optimization direction. - `sort_key(self, node: Solution) -> float` - Direction-aware sort key for solution nodes. diff --git a/plexe/search/journal.py b/plexe/search/journal.py index 4f00312a..d374301c 100644 --- a/plexe/search/journal.py +++ b/plexe/search/journal.py @@ -40,9 +40,6 @@ def __init__(self, baseline: Baseline | None = None, optimization_direction: str baseline: Baseline model for comparison optimization_direction: Metric optimization direction ("higher" or "lower") """ - if optimization_direction not in {"higher", "lower"}: - raise ValueError(f"optimization_direction must be 'higher' or 'lower', got: {optimization_direction}") - self.baseline = baseline self.baseline_performance = baseline.performance if baseline else 0.0 self.optimization_direction = optimization_direction @@ -51,6 +48,18 @@ def __init__(self, baseline: Baseline | None = None, optimization_direction: str self.successful_attempts = 0 self.failed_attempts = 0 + @property + def optimization_direction(self) -> str: + """Metric optimization direction, constrained to {'higher', 'lower'}.""" + return self._optimization_direction + + @optimization_direction.setter + def optimization_direction(self, value: str) -> None: + """Validate and set optimization direction.""" + if value not in {"higher", "lower"}: + raise ValueError(f"optimization_direction must be 'higher' or 'lower', got: {value}") + self._optimization_direction = value + def selection_score(self, value: float) -> float: """Normalize a metric value so larger always means better.""" return value if self.optimization_direction == "higher" else -value diff --git a/plexe/templates/inference/keras_predictor.py b/plexe/templates/inference/keras_predictor.py index 0e5b6b72..78309b27 100644 --- a/plexe/templates/inference/keras_predictor.py +++ b/plexe/templates/inference/keras_predictor.py @@ -64,12 +64,13 @@ def __init__(self, model_dir: str): with open(artifacts_dir / "pipeline.pkl", "rb") as f: self.pipeline = cloudpickle.load(f) - def _uses_logits_output(self) -> bool: + def _uses_logits_output(self, task_type: str | None = None) -> bool: """Return True when model outputs are logits based on training loss metadata.""" + effective_task_type = task_type or self._task_type from_logits = self._loss_config.get("from_logits") - if self._task_type == "binary_classification" and self._loss_class == "BinaryCrossentropy": + if effective_task_type == "binary_classification" and self._loss_class == "BinaryCrossentropy": return bool(from_logits) - if self._task_type == "multiclass_classification" and self._loss_class in { + if effective_task_type == "multiclass_classification" and self._loss_class in { "SparseCategoricalCrossentropy", "CategoricalCrossentropy", }: @@ -83,14 +84,17 @@ def _probabilities_from_raw(self, raw_predictions): probabilities = np.asarray(raw_predictions) if probabilities.ndim == 1: probabilities = probabilities.reshape(-1, 1) - uses_logits = self._uses_logits_output() + task_type = self._task_type + if not task_type: + task_type = "binary_classification" if probabilities.shape[1] <= 2 else "multiclass_classification" + uses_logits = self._uses_logits_output(task_type) # Legacy model metadata may omit loss_config.from_logits. # If outputs are clearly outside probability bounds, treat them as logits. if ( not uses_logits and not self._loss_config - and self._task_type + and task_type in { "binary_classification", "multiclass_classification", @@ -100,7 +104,7 @@ def _probabilities_from_raw(self, raw_predictions): if finite.size > 0 and (finite.min() < 0.0 or finite.max() > 1.0): uses_logits = True - if self._task_type == "binary_classification": + if task_type == "binary_classification": if probabilities.shape[1] == 1: positive = probabilities[:, 0] if uses_logits: @@ -117,7 +121,7 @@ def _probabilities_from_raw(self, raw_predictions): raise ValueError(f"Binary classification expects 1 or 2 outputs, got shape {probabilities.shape}") - if self._task_type == "multiclass_classification": + if task_type == "multiclass_classification": if uses_logits: shifted = probabilities - np.max(probabilities, axis=1, keepdims=True) exp_values = np.exp(shifted) diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index b0f43357..4f676d9d 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 04:33:06 +> Generated on 2026-03-03 04:46:10 Test suite structure and test case documentation. @@ -137,6 +137,7 @@ Unit tests for SearchJournal. - `test_journal_get_history_train_performance_none()` - get_history should include train_performance=None when not set. - `test_journal_serialization_preserves_optimization_direction()` - to_dict/from_dict should preserve optimization_direction. - `test_journal_from_dict_defaults_optimization_direction_to_higher()` - Older checkpoints without optimization_direction should default to higher. +- `test_journal_optimization_direction_setter_validates_values()` - No description --- ## `unit/search/test_tree_policy_determinism.py` @@ -251,6 +252,7 @@ Unit tests for Keras predictor template semantics. - `test_keras_probabilities_from_multiclass_logits() -> None` - No description - `test_keras_probabilities_infer_logits_when_loss_config_missing() -> None` - No description - `test_keras_predict_proba_raises_for_regression() -> None` - No description +- `test_keras_predict_proba_allows_missing_task_metadata() -> None` - No description --- ## `unit/test_lightgbm_predictor.py` diff --git a/tests/unit/search/test_journal.py b/tests/unit/search/test_journal.py index 9f8a31e8..15af68fa 100644 --- a/tests/unit/search/test_journal.py +++ b/tests/unit/search/test_journal.py @@ -232,3 +232,10 @@ def test_journal_from_dict_defaults_optimization_direction_to_higher(): restored = SearchJournal.from_dict(payload) assert restored.optimization_direction == "higher" + + +def test_journal_optimization_direction_setter_validates_values(): + journal = SearchJournal() + + with pytest.raises(ValueError, match="optimization_direction must be 'higher' or 'lower'"): + journal.optimization_direction = "maximize" diff --git a/tests/unit/test_keras_predictor.py b/tests/unit/test_keras_predictor.py index 5964338d..f5645499 100644 --- a/tests/unit/test_keras_predictor.py +++ b/tests/unit/test_keras_predictor.py @@ -90,3 +90,18 @@ def test_keras_predict_proba_raises_for_regression() -> None: with pytest.raises(ValueError, match="only valid for classification"): predictor.predict_proba(pd.DataFrame({"f1": [1.0, 2.0]})) + + +def test_keras_predict_proba_allows_missing_task_metadata() -> None: + predictor = KerasPredictor.__new__(KerasPredictor) + predictor._task_type = "" + predictor._loss_class = "" + predictor._loss_config = {} + predictor.pipeline = DummyPipeline() + predictor.model = DummyModel(np.array([[-2.0], [2.0]])) + + probabilities = predictor.predict_proba(pd.DataFrame({"f1": [1.0, 2.0]})) + + assert list(probabilities.columns) == ["proba_0", "proba_1"] + assert len(probabilities) == 2 + assert probabilities.iloc[0]["proba_1"] < probabilities.iloc[1]["proba_1"] From 3bd245e818efc33be46fa6263428158508fb9708 Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 04:57:13 +0000 Subject: [PATCH 11/12] fix: reject non-finite keras probability outputs --- plexe/CODE_INDEX.md | 2 +- plexe/templates/inference/keras_predictor.py | 2 ++ tests/CODE_INDEX.md | 3 ++- tests/unit/test_keras_predictor.py | 12 ++++++++++++ 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 34c537fd..8c226ee6 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 04:46:10 +> Generated on 2026-03-03 04:57:14 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/templates/inference/keras_predictor.py b/plexe/templates/inference/keras_predictor.py index 78309b27..ff9e8333 100644 --- a/plexe/templates/inference/keras_predictor.py +++ b/plexe/templates/inference/keras_predictor.py @@ -84,6 +84,8 @@ def _probabilities_from_raw(self, raw_predictions): probabilities = np.asarray(raw_predictions) if probabilities.ndim == 1: probabilities = probabilities.reshape(-1, 1) + if not np.isfinite(probabilities).all(): + raise ValueError("Keras model outputs contain NaN/Inf values; cannot compute probabilities.") task_type = self._task_type if not task_type: task_type = "binary_classification" if probabilities.shape[1] <= 2 else "multiclass_classification" diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index 4f676d9d..b07e877d 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 04:46:10 +> Generated on 2026-03-03 04:57:14 Test suite structure and test case documentation. @@ -253,6 +253,7 @@ Unit tests for Keras predictor template semantics. - `test_keras_probabilities_infer_logits_when_loss_config_missing() -> None` - No description - `test_keras_predict_proba_raises_for_regression() -> None` - No description - `test_keras_predict_proba_allows_missing_task_metadata() -> None` - No description +- `test_keras_predict_proba_raises_on_non_finite_outputs() -> None` - No description --- ## `unit/test_lightgbm_predictor.py` diff --git a/tests/unit/test_keras_predictor.py b/tests/unit/test_keras_predictor.py index f5645499..67f7b27d 100644 --- a/tests/unit/test_keras_predictor.py +++ b/tests/unit/test_keras_predictor.py @@ -105,3 +105,15 @@ def test_keras_predict_proba_allows_missing_task_metadata() -> None: assert list(probabilities.columns) == ["proba_0", "proba_1"] assert len(probabilities) == 2 assert probabilities.iloc[0]["proba_1"] < probabilities.iloc[1]["proba_1"] + + +def test_keras_predict_proba_raises_on_non_finite_outputs() -> None: + predictor = KerasPredictor.__new__(KerasPredictor) + predictor._task_type = "" + predictor._loss_class = "" + predictor._loss_config = {} + predictor.pipeline = DummyPipeline() + predictor.model = DummyModel(np.array([[np.nan], [np.inf]])) + + with pytest.raises(ValueError, match="contain NaN/Inf"): + predictor.predict_proba(pd.DataFrame({"f1": [1.0, 2.0]})) From 2aac28d4c79bb4ad7ca36a73ec154236b6966bbd Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 05:08:32 +0000 Subject: [PATCH 12/12] refactor: simplify keras logits heuristic --- plexe/CODE_INDEX.md | 2 +- plexe/templates/inference/keras_predictor.py | 3 +-- tests/CODE_INDEX.md | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 8c226ee6..f3ee5380 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-03 04:57:14 +> Generated on 2026-03-03 05:08:33 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/templates/inference/keras_predictor.py b/plexe/templates/inference/keras_predictor.py index ff9e8333..7e3274f4 100644 --- a/plexe/templates/inference/keras_predictor.py +++ b/plexe/templates/inference/keras_predictor.py @@ -102,8 +102,7 @@ def _probabilities_from_raw(self, raw_predictions): "multiclass_classification", } ): - finite = probabilities[np.isfinite(probabilities)] - if finite.size > 0 and (finite.min() < 0.0 or finite.max() > 1.0): + if probabilities.min() < 0.0 or probabilities.max() > 1.0: uses_logits = True if task_type == "binary_classification": diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index b07e877d..66b0f26a 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-03 04:57:14 +> Generated on 2026-03-03 05:08:33 Test suite structure and test case documentation.