From d0196bbc2b5db34aaad5e34a611a128befd0ca5a Mon Sep 17 00:00:00 2001
From: marcellodebernardi <marcello.logins@outlook.com>
Date: Tue, 3 Mar 2026 00:43:07 +0000
Subject: [PATCH 1/2] fix: calibrate evaluation verdict rubric and baseline
 context

---
 plexe/CODE_INDEX.md             | 2 +-
 plexe/agents/model_evaluator.py | 8 ++++++++
 tests/CODE_INDEX.md             | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md
index 752a9420..daafbf81 100644
--- a/plexe/CODE_INDEX.md
+++ b/plexe/CODE_INDEX.md
@@ -1,6 +1,6 @@
 # Code Index: plexe
 
-> Generated on 2026-03-02 22:03:39
+> Generated on 2026-03-03 00:43:08
 
 Code structure and public interface documentation for the **plexe** package.
 
diff --git a/plexe/agents/model_evaluator.py b/plexe/agents/model_evaluator.py
index 7c098e34..735a93d3 100644
--- a/plexe/agents/model_evaluator.py
+++ b/plexe/agents/model_evaluator.py
@@ -260,6 +260,9 @@ def run(
             ),
             "baseline_performance": self.context.baseline_performance,  # Validation set performance (for reference)
             "baseline_predictor": self.context.baseline_predictor,  # For re-evaluation on test set
+            "core_metrics_report": self.context.scratch.get(
+                "_core_metrics_report"
+            ),  # Model's test metrics from Phase 1
         }
         synthesis_args = {**additional_args, **baseline_context}
 
@@ -469,6 +472,11 @@ def _get_phase_synthesis_prompt(task: str, explainability_required: bool) -> str
             f"2. Synthesize prioritized recommendations (HIGH/MEDIUM/LOW)\\n"
             f"3. Write executive summary (2-3 sentences)\\n"
             f"4. Determine deployment readiness\\n\\n"
+            f"Verdict rubric:\\n"
+            f"- FAIL: Model is broken, materially worse than heuristic baseline, or has catastrophic/high-severity issues.\\n"
+            f"- CONDITIONAL_PASS: Model is roughly on par with baseline, or is better than baseline but has material unresolved issues.\\n"
+            f"- PASS: Model is clearly better than baseline, has no catastrophic/high-severity issues, and is generally robust.\\n"
+            f"- Do not assign CONDITIONAL_PASS only because minor improvement opportunities exist.\\n\\n"
             f"Register using:\\n"
             f"register_final_evaluation_report(\\n"
             f"    verdict='PASS'|'CONDITIONAL_PASS'|'FAIL',\\n"
diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md
index 255f6752..7ba9e1ec 100644
--- a/tests/CODE_INDEX.md
+++ b/tests/CODE_INDEX.md
@@ -1,6 +1,6 @@
 # Code Index: tests
 
-> Generated on 2026-03-02 22:03:39
+> Generated on 2026-03-03 00:43:08
 
 Test suite structure and test case documentation.
 

From b068093007504f0e312214a296134d55a7d17d22 Mon Sep 17 00:00:00 2001
From: marcellodebernardi <marcello.logins@outlook.com>
Date: Tue, 3 Mar 2026 00:59:39 +0000
Subject: [PATCH 2/2] chore: bump version to 1.4.2

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 970d0c83..b2bf899e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "plexe"
-version = "1.4.1"
+version = "1.4.2"
 description = "An agentic framework for building ML models from natural language"
 authors = [
     "Marcello De Bernardi <mdebernardi@plexe.ai>",