From d0196bbc2b5db34aaad5e34a611a128befd0ca5a Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 00:43:07 +0000 Subject: [PATCH 1/2] fix: calibrate evaluation verdict rubric and baseline context --- plexe/CODE_INDEX.md | 2 +- plexe/agents/model_evaluator.py | 8 ++++++++ tests/CODE_INDEX.md | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/plexe/CODE_INDEX.md b/plexe/CODE_INDEX.md index 752a9420..daafbf81 100644 --- a/plexe/CODE_INDEX.md +++ b/plexe/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: plexe -> Generated on 2026-03-02 22:03:39 +> Generated on 2026-03-03 00:43:08 Code structure and public interface documentation for the **plexe** package. diff --git a/plexe/agents/model_evaluator.py b/plexe/agents/model_evaluator.py index 7c098e34..735a93d3 100644 --- a/plexe/agents/model_evaluator.py +++ b/plexe/agents/model_evaluator.py @@ -260,6 +260,9 @@ def run( ), "baseline_performance": self.context.baseline_performance, # Validation set performance (for reference) "baseline_predictor": self.context.baseline_predictor, # For re-evaluation on test set + "core_metrics_report": self.context.scratch.get( + "_core_metrics_report" + ), # Model's test metrics from Phase 1 } synthesis_args = {**additional_args, **baseline_context} @@ -469,6 +472,11 @@ def _get_phase_synthesis_prompt(task: str, explainability_required: bool) -> str f"2. Synthesize prioritized recommendations (HIGH/MEDIUM/LOW)\\n" f"3. Write executive summary (2-3 sentences)\\n" f"4. Determine deployment readiness\\n\\n" + f"Verdict rubric:\\n" + f"- FAIL: Model is broken, materially worse than heuristic baseline, or has catastrophic/high-severity issues.\\n" + f"- CONDITIONAL_PASS: Model is roughly on par with baseline, or is better than baseline but has material unresolved issues.\\n" + f"- PASS: Model is clearly better than baseline, has no catastrophic/high-severity issues, and is generally robust.\\n" + f"- Do not assign CONDITIONAL_PASS only because minor improvement opportunities exist.\\n\\n" f"Register using:\\n" f"register_final_evaluation_report(\\n" f" verdict='PASS'|'CONDITIONAL_PASS'|'FAIL',\\n" diff --git a/tests/CODE_INDEX.md b/tests/CODE_INDEX.md index 255f6752..7ba9e1ec 100644 --- a/tests/CODE_INDEX.md +++ b/tests/CODE_INDEX.md @@ -1,6 +1,6 @@ # Code Index: tests -> Generated on 2026-03-02 22:03:39 +> Generated on 2026-03-03 00:43:08 Test suite structure and test case documentation. From b068093007504f0e312214a296134d55a7d17d22 Mon Sep 17 00:00:00 2001 From: marcellodebernardi Date: Tue, 3 Mar 2026 00:59:39 +0000 Subject: [PATCH 2/2] chore: bump version to 1.4.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 970d0c83..b2bf899e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "plexe" -version = "1.4.1" +version = "1.4.2" description = "An agentic framework for building ML models from natural language" authors = [ "Marcello De Bernardi ",