diff --git a/plexe/agents/model_evaluator.py b/plexe/agents/model_evaluator.py index 7c098e34..735a93d3 100644 --- a/plexe/agents/model_evaluator.py +++ b/plexe/agents/model_evaluator.py @@ -260,6 +260,9 @@ def run( ), "baseline_performance": self.context.baseline_performance, # Validation set performance (for reference) "baseline_predictor": self.context.baseline_predictor, # For re-evaluation on test set + "core_metrics_report": self.context.scratch.get( + "_core_metrics_report" + ), # Model's test metrics from Phase 1 } synthesis_args = {**additional_args, **baseline_context} @@ -469,6 +472,11 @@ def _get_phase_synthesis_prompt(task: str, explainability_required: bool) -> str f"2. Synthesize prioritized recommendations (HIGH/MEDIUM/LOW)\\n" f"3. Write executive summary (2-3 sentences)\\n" f"4. Determine deployment readiness\\n\\n" + f"Verdict rubric:\\n" + f"- FAIL: Model is broken, materially worse than heuristic baseline, or has catastrophic/high-severity issues.\\n" + f"- CONDITIONAL_PASS: Model is roughly on par with baseline, or is better than baseline but has material unresolved issues.\\n" + f"- PASS: Model is clearly better than baseline, has no catastrophic/high-severity issues, and is generally robust.\\n" + f"- Do not assign CONDITIONAL_PASS only because minor improvement opportunities exist.\\n\\n" f"Register using:\\n" f"register_final_evaluation_report(\\n" f" verdict='PASS'|'CONDITIONAL_PASS'|'FAIL',\\n" diff --git a/pyproject.toml b/pyproject.toml index 970d0c83..b2bf899e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "plexe" -version = "1.4.1" +version = "1.4.2" description = "An agentic framework for building ML models from natural language" authors = [ "Marcello De Bernardi ",