Merged
27 changes: 27 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -45,6 +45,7 @@ help:
@echo ""
@echo "📊 Example Datasets:"
@echo " make run-titanic Run on Titanic dataset (medium)"
@echo " make run-titanic-proba Run Titanic with probability-focused intent"
@echo " make run-house-prices Run on House Prices dataset (regression)"
@echo ""
@echo "🏗️ Building:"
@@ -331,6 +332,32 @@ run-titanic: build
--spark-mode local \
--enable-final-evaluation

# Spaceship Titanic dataset with probability-focused objective
.PHONY: run-titanic-proba
run-titanic-proba: build
@echo "📊 Running on Spaceship Titanic dataset (probability-focused)..."
$(eval TIMESTAMP := $(shell date +%Y%m%d_%H%M%S))
docker run --rm \
--add-host=host.docker.internal:host-gateway \
$(CONFIG_MOUNT) \
$(CONFIG_ENV) \
-v $(PWD)/examples/datasets:/data:ro \
-v $(PWD)/workdir:/workdir \
-e OPENAI_API_KEY=$(OPENAI_API_KEY) \
-e ANTHROPIC_API_KEY=$(ANTHROPIC_API_KEY) \
-e SPARK_LOCAL_CORES=4 \
-e SPARK_DRIVER_MEMORY=4g \
plexe:py$(PYTHON_VERSION) \
python -m plexe.main \
--train-dataset-uri /data/spaceship-titanic/train.parquet \
--user-id dev_user \
--intent "predict each passenger's probability of being transported; optimize probability quality and ranking" \
--experiment-id titanic_proba \
--max-iterations 5 \
--work-dir /workdir/titanic_proba/$(TIMESTAMP) \
--spark-mode local \
--enable-final-evaluation

# House Prices dataset (regression)
.PHONY: run-house-prices
run-house-prices: build
17 changes: 14 additions & 3 deletions plexe/CODE_INDEX.md
@@ -1,6 +1,6 @@
# Code Index: plexe

> Generated on 2026-03-03 00:06:47
> Generated on 2026-03-03 05:08:33

Code structure and public interface documentation for the **plexe** package.

@@ -222,6 +222,8 @@ Helper functions for workflow.

**Functions:**
- `select_viable_model_types(data_layout: DataLayout, selected_frameworks: list[str] | None) -> list[str]` - Select viable model types using three-tier filtering.
- `metric_requires_probabilities(metric_name: str) -> bool` - Return True when a metric requires probability scores instead of hard labels.
- `normalize_probability_predictions(y_true: np.ndarray, y_pred_proba: Any, metric_name: str) -> np.ndarray` - Normalize probability predictions for sklearn metric compatibility.
- `evaluate_on_sample(spark: SparkSession, sample_uri: str, model_artifacts_path: Path, model_type: str, metric: str, target_columns: list[str], group_column: str | None, train_sample_uri: str | None) -> tuple[float, float | None]` - Evaluate model on validation sample, optionally also on training sample.
- `compute_metric_hardcoded(y_true, y_pred, metric_name: str) -> float` - Compute metric using hardcoded sklearn implementations.
- `compute_metric(y_true, y_pred, metric_name: str, group_ids) -> float` - Compute metric value.
@@ -393,7 +395,12 @@ Insight store for accumulating learnings from search.
Search journal for tracking model search tree.

**`SearchJournal`** - Tracks solution search tree.
- `__init__(self, baseline: Baseline | None)`
- `__init__(self, baseline: Baseline | None, optimization_direction: str)`
- `optimization_direction(self) -> str` - Metric optimization direction, constrained to {'higher', 'lower'}.
- `optimization_direction(self, value: str) -> None` - Validate and set optimization direction.
- `selection_score(self, value: float) -> float` - Normalize a metric value so larger always means better.
- `is_better(self, candidate: float, reference: float | None) -> bool` - Compare two metric values using the configured optimization direction.
- `sort_key(self, node: Solution) -> float` - Direction-aware sort key for solution nodes.
- `add_node(self, node: Solution) -> None` - Add a solution to the journal.
- `draft_nodes(self) -> list[Solution]` - Get all root nodes (bootstrap solutions without parents).
- `buggy_nodes(self) -> list[Solution]` - Get all buggy nodes that could be debugged.
@@ -447,6 +454,7 @@ Standard CatBoost predictor - NO Plexe dependencies.
**`CatBoostPredictor`** - Standalone CatBoost predictor.
- `__init__(self, model_dir: str)`
- `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame.
- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame.

---
## `templates/inference/keras_predictor.py`
@@ -464,6 +472,7 @@ Standard LightGBM predictor - NO Plexe dependencies.
**`LightGBMPredictor`** - Standalone LightGBM predictor.
- `__init__(self, model_dir: str)`
- `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame.
- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame.

---
## `templates/inference/pytorch_predictor.py`
@@ -481,6 +490,7 @@ Standard XGBoost predictor - NO Plexe dependencies.
**`XGBoostPredictor`** - Standalone XGBoost predictor.
- `__init__(self, model_dir: str)`
- `predict(self, x: pd.DataFrame) -> pd.DataFrame` - Make predictions on input DataFrame.
- `predict_proba(self, x: pd.DataFrame) -> pd.DataFrame` - Predict per-class probabilities on input DataFrame.

---
## `templates/packaging/model_card_template.py`
@@ -539,7 +549,7 @@ Submission tools for agents.
- `get_register_statistical_profile_tool(context: BuildContext)` - Factory: Returns statistical profile submission tool.
- `get_register_layout_tool(context: BuildContext)` - Factory: Returns layout detection submission tool.
- `get_register_eda_report_tool(context: BuildContext)` - Factory: Returns EDA report submission tool.
- `get_save_split_uris_tool(context: BuildContext)` - Factory: Returns split URI submission tool.
- `get_save_split_uris_tool(context: BuildContext, spark: Any | None, expected_ratios: dict[str, float] | None)` - Factory: Returns split URI submission tool.
- `get_save_sample_uris_tool(context: BuildContext)` - Factory: Returns sample URIs submission tool.
- `get_save_metric_implementation_fn(context: BuildContext)` - Factory: Returns metric implementation submission function.
- `get_validate_baseline_predictor_tool(context: BuildContext, val_sample_df)` - Factory: Returns baseline predictor validation tool.
@@ -694,6 +704,7 @@ OpenTelemetry tracing decorators for agents and tools.
Validation functions for pipelines, models, and other agent outputs.

**Functions:**
- `canonicalize_split_ratios(split_ratios: dict[str, float] | None) -> dict[str, float]` - Normalize split ratio key aliases to canonical names.
- `validate_sklearn_pipeline(pipeline: Pipeline, sample_df: pd.DataFrame, target_columns: list[str]) -> tuple[bool, str]` - Validate that an sklearn Pipeline is well-formed and functional.
- `validate_pipeline_consistency(pipeline: Pipeline, train_sample: pd.DataFrame, val_sample: pd.DataFrame, target_columns: list[str]) -> tuple[bool, str]` - Validate pipeline produces consistent output shape on train/val samples.
- `validate_xgboost_params(params: dict[str, Any]) -> tuple[bool, str]` - Validate XGBoost hyperparameters.
33 changes: 28 additions & 5 deletions plexe/agents/baseline_builder.py
@@ -17,6 +17,7 @@

from plexe.config import Config, get_routing_for_model
from plexe.constants import DirNames
from plexe.helpers import compute_metric, metric_requires_probabilities, normalize_probability_predictions
from plexe.models import BuildContext, Baseline
from plexe.utils.tracing import agent_span
from plexe.tools.submission import (
@@ -54,6 +55,13 @@ def _build_agent(self, val_sample_df) -> CodeAgent:
from plexe.agents.utils import format_user_feedback_for_prompt

feedback_section = format_user_feedback_for_prompt(self.context.scratch.get("_user_feedback"))
requires_proba = metric_requires_probabilities(self.context.metric.name)
proba_requirement = (
"- Because the selected primary metric requires probabilities, your class MUST also implement\n"
" `predict_proba(self, x: pd.DataFrame) -> np.ndarray | pd.DataFrame` that returns per-sample scores.\n"
if requires_proba
else ""
)

# Get routing configuration for this agent's model
api_base, headers = get_routing_for_model(self.config.routing_config, self.llm_model)
@@ -128,6 +136,7 @@ def _build_agent(self, val_sample_df) -> CodeAgent:
"## CRITICAL:\n"
"- Use task_analysis['output_targets'] to identify target column(s)\n"
"- Predictor must have standard .predict(X) -> array interface\n"
f"{proba_requirement}"
),
model=PlexeLiteLLMModel(
model_id=self.llm_model,
@@ -164,13 +173,19 @@ def run(self) -> Baseline:
task_type = self.context.task_analysis.get("task_type", "unknown")
target_columns = self.context.output_targets
metric_name = self.context.metric.name
proba_note = (
"Primary metric requires probabilities, so baseline must implement predict_proba(X)."
if metric_requires_probabilities(metric_name)
else "Primary metric uses label/value predictions."
)

task = (
f"Create a simple baseline predictor for this ML task.\n\n"
f"**ML TASK**: {self.context.intent}\n\n"
f"Task Type: {task_type}\n"
f"Target Column(s): {target_columns}\n"
f"Metric: {metric_name}\n\n"
f"{proba_note}\n\n"
f"Build ONE simple baseline (heuristics preferred) that makes sense for this ML task. "
f"Register it and evaluate performance using the tools provided."
)
@@ -224,20 +239,28 @@ def _evaluate_performance(self, val_sample_df) -> float:
Returns:
Performance metric value
"""
from plexe.helpers import compute_metric

# Separate features from target
target_cols = self.context.output_targets
feature_cols = [col for col in val_sample_df.columns if col not in target_cols]

X_val = val_sample_df[feature_cols]
y_val = val_sample_df[target_cols[0]]

# Make predictions
y_pred = self.context.baseline_predictor.predict(X_val)
if metric_requires_probabilities(self.context.metric.name):
if not hasattr(self.context.baseline_predictor, "predict_proba") or not callable(
self.context.baseline_predictor.predict_proba
):
raise ValueError(
f"Metric '{self.context.metric.name}' requires probabilities but baseline predictor "
"does not implement predict_proba()."
)
raw_proba = self.context.baseline_predictor.predict_proba(X_val)
y_pred_input = normalize_probability_predictions(y_val.values, raw_proba, self.context.metric.name)
else:
y_pred_input = self.context.baseline_predictor.predict(X_val)

# Compute metric
performance = compute_metric(y_true=y_val.values, y_pred=y_pred, metric_name=self.context.metric.name)
performance = compute_metric(y_true=y_val.values, y_pred=y_pred_input, metric_name=self.context.metric.name)

logger.info(f"Baseline performance: {self.context.metric.name}={performance:.4f}")

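A heuristic baseline that satisfies the new `predict_proba` contract enforced above might look like the following. This is a hypothetical class-prior example, not code from the PR; only the interface shape (`predict` returning labels, `predict_proba` returning one column per class) mirrors what `_evaluate_performance` now requires:

```python
import numpy as np
import pandas as pd


class ClassPriorBaseline:
    """Hypothetical heuristic baseline: predicts the majority class, and the
    training class priors as per-row probabilities (one column per class)."""

    def fit(self, y: pd.Series) -> "ClassPriorBaseline":
        self.classes_ = np.sort(y.unique())
        freqs = y.value_counts(normalize=True)
        self.priors_ = np.array([freqs[c] for c in self.classes_])
        return self

    def predict(self, x: pd.DataFrame) -> np.ndarray:
        # Hard labels: every row gets the most frequent training class
        majority = self.classes_[np.argmax(self.priors_)]
        return np.full(len(x), majority)

    def predict_proba(self, x: pd.DataFrame) -> np.ndarray:
        # Probabilities: every row gets the training priors
        return np.tile(self.priors_, (len(x), 1))


y_train = pd.Series([0, 0, 0, 1])
baseline = ClassPriorBaseline().fit(y_train)
X = pd.DataFrame({"f": [1.0, 2.0]})
print(baseline.predict(X))        # → [0 0]
print(baseline.predict_proba(X))  # → [[0.75 0.25] [0.75 0.25]]
```

Such a baseline would pass the `hasattr(..., "predict_proba")` guard and give probability metrics like log_loss a meaningful floor, whereas a labels-only baseline now raises a `ValueError`.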
10 changes: 7 additions & 3 deletions plexe/agents/dataset_splitter.py
@@ -53,10 +53,12 @@ def __init__(self, spark: SparkSession, dataset_uri: str, context: BuildContext,
self.config = config
self.llm_model = config.dataset_splitting_llm

def _build_agent(self) -> CodeAgent:
def _build_agent(self, split_ratios: dict[str, float]) -> CodeAgent:
"""Build CodeAgent with splitting tool."""
# Get routing configuration for this agent's model
api_base, headers = get_routing_for_model(self.config.routing_config, self.llm_model)
# TODO(splitter-prompts): Make split instructions conditional on requested split mode.
# 2-way modes should not instruct writing test.parquet or passing test_uri.

return CodeAgent(
name="DatasetSplitter",
@@ -142,7 +144,7 @@ def _build_agent(self) -> CodeAgent:
extra_headers=headers,
),
verbosity_level=self.config.agent_verbosity_level,
tools=[get_save_split_uris_tool(self.context)],
tools=[get_save_split_uris_tool(self.context, self.spark, split_ratios)],
add_base_tools=False,
additional_authorized_imports=self.config.allowed_base_imports
+ [
@@ -189,7 +191,7 @@ def run(self, split_ratios: dict[str, float], output_dir: str | Path) -> tuple[s
output_dir_str = str(output_dir)

# Build agent
agent = self._build_agent()
agent = self._build_agent(split_ratios)

# Build task prompt (use string version of output_dir)
task = self._build_task_prompt(split_ratios, output_dir_str)
@@ -263,6 +265,8 @@ def _build_task_prompt(self, split_ratios: dict[str, float], output_dir: str) ->

prompt += (
"\n"
# TODO(splitter-prompts): Make this task prompt explicitly 2-way vs 3-way.
# Current wording always asks for train/val/test outputs, which can induce accidental holdouts.
"Based on the task type and data characteristics, choose the appropriate splitting strategy:\n"
"- Classification → Stratified split (preserve class balance)\n"
"- Forecasting future events/values → Chronological split (train on past, test on future)\n"
12 changes: 10 additions & 2 deletions plexe/agents/model_evaluator.py
@@ -95,8 +95,11 @@ def _build_agent(self, phase_name: str, phase_prompt: str, tools: list) -> CodeA
"- primary_metric_name: Name of the primary optimization metric (string)\\n"
"- output_targets: list[str] (target column names to exclude from features)\\n\\n"
"PREDICTOR INTERFACE:\\n"
"The predictor's predict() function takes a pandas DataFrame (features only, no target)\\n"
"and returns a pandas DataFrame with a 'prediction' column.\\n\\n"
"- predict(X): input is features-only pandas DataFrame (no target columns), returns DataFrame with 'prediction'\\n"
"- predict_proba(X): input is features-only pandas DataFrame; classification only; returns per-class probabilities\\n\\n"
"CRITICAL METRIC RULE:\\n"
"If a metric is probability-based (roc_auc, roc_auc_ovr, roc_auc_ovo, log_loss),\\n"
"you MUST compute it from predict_proba() outputs, not from thresholded labels.\\n\\n"
"Example usage:\\n"
"```python\\n"
"# Prepare features (drop target columns using output_targets)\\n"
@@ -106,6 +109,7 @@ def _build_agent(self, phase_name: str, phase_prompt: str, tools: list) -> CodeA
"# Generate predictions (returns DataFrame with 'prediction' column)\\n"
"predictions_df = predictor.predict(X_test)\\n"
"y_pred = predictions_df['prediction'].values\\n"
"# For probability metrics, use predictor.predict_proba(X_test)\\n"
"```\\n\\n"
"## YOUR MISSION:\\n"
f"{phase_prompt}\\n\\n"
@@ -329,6 +333,10 @@ def _get_phase_1_prompt(task: str, primary_metric_name: str) -> str:
f"3. Compute primary metric + 4-6 additional relevant metrics\\n"
f"4. ENCOURAGED: Compute 95% confidence intervals using bootstrap (1000 samples)\\n"
f"5. Interpret results - what do these numbers tell us?\\n\\n"
f"If computing probability-based metrics (roc_auc, roc_auc_ovr, roc_auc_ovo, log_loss):\\n"
f"- Use predictor.predict_proba(X_test), never thresholded class labels\\n"
f"- Binary: use positive-class scores\\n"
f"- Multiclass: use full per-class probability matrix\\n\\n"
f"Register using:\\n"
f"register_core_metrics_report(\\n"
f" task_type='...', # your detected type\\n"
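The "CRITICAL METRIC RULE" added to the evaluator prompt can be demonstrated concretely. In this minimal sketch the `DummyPredictor` is hypothetical (a stand-in for a trained model exposing both interfaces); the sklearn calls are real. AUC from thresholded labels discards ranking information, which is exactly what the rule guards against:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score


class DummyPredictor:
    """Hypothetical predictor exposing both interfaces from the prompt."""

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        # Hard labels via a 0.5 threshold
        return pd.DataFrame({"prediction": (x["score"] > 0.5).astype(int)})

    def predict_proba(self, x: pd.DataFrame) -> pd.DataFrame:
        # Per-class probabilities, one column per class
        p = x["score"].to_numpy()
        return pd.DataFrame({"class_0": 1 - p, "class_1": p})


X_test = pd.DataFrame({"score": [0.1, 0.4, 0.6, 0.9]})
y_true = np.array([0, 1, 0, 1])
pred = DummyPredictor()

# Wrong: AUC from thresholded labels collapses the ranking
auc_labels = roc_auc_score(y_true, pred.predict(X_test)["prediction"])
# Right: AUC from positive-class probabilities preserves it
auc_proba = roc_auc_score(y_true, pred.predict_proba(X_test)["class_1"])
print(auc_labels, auc_proba)  # → 0.5 0.75
```

Here the thresholded labels score a chance-level 0.5 while the underlying probabilities score 0.75, so an evaluator that ignores the rule would systematically misreport probability metrics.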
67 changes: 65 additions & 2 deletions plexe/helpers.py
@@ -28,6 +28,13 @@

logger = logging.getLogger(__name__)

PROBABILITY_METRICS = {
StandardMetric.ROC_AUC.value,
StandardMetric.ROC_AUC_OVR.value,
StandardMetric.ROC_AUC_OVO.value,
StandardMetric.LOG_LOSS.value,
}


def select_viable_model_types(data_layout: DataLayout, selected_frameworks: list[str] | None = None) -> list[str]:
"""
@@ -85,6 +92,52 @@ def select_viable_model_types(data_layout: DataLayout, selected_frameworks: list
return viable


def metric_requires_probabilities(metric_name: str) -> bool:
"""Return True when a metric requires probability scores instead of hard labels."""
return metric_name.lower().strip() in PROBABILITY_METRICS


def normalize_probability_predictions(y_true: np.ndarray, y_pred_proba: Any, metric_name: str) -> np.ndarray:
"""
Normalize probability predictions for sklearn metric compatibility.

- Binary metrics use positive-class scores (1D array).
- Multiclass metrics use full per-class probability matrix (2D array).
"""
probabilities = y_pred_proba.values if hasattr(y_pred_proba, "values") else np.asarray(y_pred_proba)
metric = metric_name.lower().strip()
n_classes = len(np.unique(y_true))

if probabilities.ndim == 1:
if n_classes > 2:
raise ValueError(f"Metric '{metric_name}' requires per-class probabilities for multiclass tasks.")
return probabilities

if probabilities.ndim != 2:
raise ValueError(f"Expected probability outputs to be 1D or 2D, got shape {probabilities.shape}")

original_n_cols = probabilities.shape[1]
if probabilities.shape[1] == 1:
probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]])

is_multiclass = probabilities.shape[1] > 2 or n_classes > 2
if not is_multiclass:
return probabilities[:, 1]

if probabilities.shape[1] != n_classes and metric in PROBABILITY_METRICS:
reported_n_cols = (
original_n_cols if original_n_cols == 1 and probabilities.shape[1] == 2 else probabilities.shape[1]
)
column_label = "column" if reported_n_cols == 1 else "columns"
raise ValueError(
f"Probability outputs have {reported_n_cols} {column_label} but validation labels contain {n_classes} "
f"distinct classes for metric '{metric_name}'. For multiclass tasks, predictor.predict_proba "
f"must return one probability column per class."
)

return probabilities


def evaluate_on_sample(
spark: SparkSession,
sample_uri: str,
@@ -170,8 +223,18 @@ def _evaluate_predictor(

X = df.drop(columns=columns_to_drop)
y = df[target_columns[0]]
predictions = predictor.predict(X)["prediction"].values
return compute_metric(y, predictions, metric, group_ids=group_ids)
if metric_requires_probabilities(metric):
if not hasattr(predictor, "predict_proba") or not callable(predictor.predict_proba):
raise ValueError(
f"Metric '{metric}' requires probability scores, but predictor {type(predictor).__name__} "
"does not implement predict_proba()."
)
raw_probabilities = predictor.predict_proba(X)
predictions = normalize_probability_predictions(y.values, raw_probabilities, metric)
else:
predictions = predictor.predict(X)["prediction"].values

return compute_metric(y.values, predictions, metric, group_ids=group_ids)


def compute_metric_hardcoded(y_true, y_pred, metric_name: str) -> float:
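To see what `normalize_probability_predictions` does at the call sites above, here is a condensed, standalone version of the helper from this patch. The shape-handling logic is copied from the diff; the multiclass column-count validation and error messages are omitted for brevity:

```python
import numpy as np


def normalize_probability_predictions(y_true, y_pred_proba, metric_name):
    """Condensed version of the helper added in this diff (error paths omitted)."""
    probabilities = y_pred_proba.values if hasattr(y_pred_proba, "values") else np.asarray(y_pred_proba)
    n_classes = len(np.unique(y_true))
    if probabilities.ndim == 1:
        return probabilities  # already positive-class scores
    if probabilities.shape[1] == 1:
        # Expand a single-column output to [P(neg), P(pos)]
        probabilities = np.column_stack([1 - probabilities[:, 0], probabilities[:, 0]])
    if probabilities.shape[1] > 2 or n_classes > 2:
        return probabilities  # multiclass: keep the full per-class matrix
    return probabilities[:, 1]  # binary: sklearn wants the positive-class column


y_true = np.array([0, 1, 1, 0])
single_col = np.array([[0.9], [0.2], [0.7], [0.4]])
out = normalize_probability_predictions(y_true, single_col, "roc_auc")
print(out)  # → [0.9 0.2 0.7 0.4]
```

This is the shape contract sklearn expects: `roc_auc_score` and `log_loss` take a 1D positive-class vector for binary tasks but a 2D per-class matrix for multiclass, and the helper lets predictors return either a DataFrame or an ndarray in any of those shapes.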