plexe-ai · marcellodebernardi · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/plexe/agents/model_evaluator.py b/plexe/agents/model_evaluator.py
@@ -260,6 +260,9 @@ def run(
             ),
             "baseline_performance": self.context.baseline_performance,  # Validation set performance (for reference)
             "baseline_predictor": self.context.baseline_predictor,  # For re-evaluation on test set
+            "core_metrics_report": self.context.scratch.get(
+                "_core_metrics_report"
+            ),  # Model's test metrics from Phase 1 (presumably None if the scratch key was never set - TODO confirm)
         }
         synthesis_args = {**additional_args, **baseline_context}
 
@@ -469,6 +472,11 @@ def _get_phase_synthesis_prompt(task: str, explainability_required: bool) -> str
             f"2. Synthesize prioritized recommendations (HIGH/MEDIUM/LOW)\\n"
             f"3. Write executive summary (2-3 sentences)\\n"
             f"4. Determine deployment readiness\\n\\n"
+            f"Verdict rubric:\\n"
+            f"- FAIL: Model is broken, materially worse than heuristic baseline, or has catastrophic/high-severity issues.\\n"
+            f"- CONDITIONAL_PASS: Model is roughly on par with baseline, or is better than baseline but has material unresolved issues.\\n"
+            f"- PASS: Model is clearly better than baseline, has no catastrophic/high-severity issues, and is generally robust.\\n"
+            f"- Do not assign CONDITIONAL_PASS only because minor improvement opportunities exist.\\n\\n"
             f"Register using:\\n"
             f"register_final_evaluation_report(\\n"
             f"    verdict='PASS'|'CONDITIONAL_PASS'|'FAIL',\\n"

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "plexe"
-version = "1.4.1"
+version = "1.4.2"
 description = "An agentic framework for building ML models from natural language"
 authors = [
     "Marcello De Bernardi <mdebernardi@plexe.ai>",