From c46d41e1cea7db238e0fdee45521a6c8b71d80ca Mon Sep 17 00:00:00 2001 From: "Keshav Ramji Keshav.Ramji@ibm.com" Date: Wed, 29 Oct 2025 21:53:13 +0000 Subject: [PATCH 1/3] v1 working --- .gitignore | 5 + cli/eval/__init__.py | 1 + cli/eval/commands.py | 40 ++++ cli/eval/runner.py | 358 +++++++++++++++++++++++++++++++ cli/m.py | 3 + mellea/__init__.py | 3 +- mellea/stdlib/test_based_eval.py | 141 ++++++++++++ 7 files changed, 550 insertions(+), 1 deletion(-) create mode 100644 cli/eval/__init__.py create mode 100644 cli/eval/commands.py create mode 100644 cli/eval/runner.py create mode 100644 mellea/stdlib/test_based_eval.py diff --git a/.gitignore b/.gitignore index 6b5814f3..f391d4a6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# KR files +kr_results/ +kr_data/ +xet/ + # Python-generated files __pycache__/ *.py[oc] diff --git a/cli/eval/__init__.py b/cli/eval/__init__.py new file mode 100644 index 00000000..7f625a26 --- /dev/null +++ b/cli/eval/__init__.py @@ -0,0 +1 @@ +"""CLI for test-based evaluation""" diff --git a/cli/eval/commands.py b/cli/eval/commands.py new file mode 100644 index 00000000..3ca7f4cc --- /dev/null +++ b/cli/eval/commands.py @@ -0,0 +1,40 @@ +import typer + +eval_app = typer.Typer(name="eval") + + +def eval_run( + test_files: list[str] = typer.Argument( + ..., help="List of paths to json/jsonl files containing test cases" + ), + backend: str = typer.Option("ollama", "--backend", "-b", help="Generation backend"), + model: str = typer.Option(None, "--model", help="Generation model name"), + judge_backend: str = typer.Option( + None, "--judge-backend", "-jb", help="Judge backend" + ), + judge_model: str = typer.Option(None, "--judge-model", help="Judge model name"), + output_path: str = typer.Option( + "eval_results", "--output-path", "-o", help="Output path for results" + ), + output_format: str = typer.Option( + "json", "--output-format", help="Either json or jsonl format for results" + ), + verbose: bool = typer.Option(False, "--verbose", "-v"), + continue_on_error: bool = typer.Option(True, "--continue-on-error"), +): + from cli.eval.runner import run_evaluations + + run_evaluations( + test_files=test_files, + backend=backend, + model=model, + judge_backend=judge_backend, + judge_model=judge_model, + output_path=output_path, + output_format=output_format, + verbose=verbose, + continue_on_error=continue_on_error, + ) + + +eval_app.command("run")(eval_run) diff --git a/cli/eval/runner.py b/cli/eval/runner.py new file mode 100644 index 00000000..7b2e7eca --- /dev/null +++ b/cli/eval/runner.py @@ -0,0 +1,358 @@ +import json +import re +from pathlib import Path +from typing import List + +import mellea +from mellea.stdlib.base import ModelOutputThunk +from mellea.stdlib.requirement import Requirement +from mellea.stdlib.test_based_eval import TestBasedEval +from mellea.backends.types import ModelOption + +from rich.console import Console +from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn +from rich.table import Table + +console = Console() + + +class InputEvalResult: + """Store results of a single input evaluation (within a unit test)""" + + def __init__( + self, + input_text: str, + model_output: str, + validation_passed: bool, + score: int, + validation_reason: str, + ): + self.input_text = input_text + self.model_output = model_output + self.validation_passed = validation_passed + self.score = score + self.validation_reason = validation_reason + + def to_dict(self): + return { + "input": self.input_text, + "model_output": 
self.model_output, + "passed": self.validation_passed, + "score": self.score, + "justification": self.validation_reason, + } + + +class TestEvalResult: + """Store results of a single test evaluation""" + + def __init__(self, test_eval: TestBasedEval, input_results: list[InputEvalResult]): + self.test_eval = test_eval + self.input_results = input_results + + def to_dict(self): + return { + "conversation_id": self.test_eval.conversation_id, + "category": self.test_eval.category, + "input_results": [r.to_dict() for r in self.input_results], + "expected_targets": self.test_eval.targets, + "unit_test_instructions": self.test_eval.unit_test_instructions, + "passed": self.passed_count, + "total_count": self.total_count, + "pass_rate": self.pass_rate, + "metadata": self.test_eval.metadata, + } + + @property + def passed_count(self) -> int: + return sum(1 for r in self.input_results if r.validation_passed) + + @property + def total_count(self) -> int: + return len(self.input_results) + + @property + def pass_rate(self) -> float: + return self.passed_count / self.total_count if self.total_count > 0 else 0.0 + + +def create_session(backend: str, model: str | None) -> mellea.MelleaSession: + """Create a mellea session with the specified backend and model.""" + + model_id = None + if model: + if model.isupper() or "_" in model: + if hasattr(mellea.model_ids, model): + model_id = getattr(mellea.model_ids, model) + else: + model_id = model + else: + model_id = model + else: + model_id = mellea.model_ids.IBM_GRANITE_4_MICRO_3B + + try: + backend_lower = backend.lower() + + if backend_lower == "ollama": + from mellea.backends.ollama import OllamaModelBackend + + backend_instance = OllamaModelBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + elif backend_lower == "openai": + from mellea.backends.openai import OpenAIBackend + + backend_instance = OpenAIBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + elif backend_lower in ["hf", "huggingface"]: + from mellea.backends.huggingface import LocalHFBackend + + backend_instance = LocalHFBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + elif backend_lower == "watsonx": + from mellea.backends.watsonx import WatsonxAIBackend + + backend_instance = WatsonxAIBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + elif backend_lower == "litellm": + from mellea.backends.litellm import LiteLLMBackend + + backend_instance = LiteLLMBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + else: + raise ValueError( + f"Unknown backend: {backend}. Supported: ollama, openai, hf, watsonx, litellm" + ) + + # create session with backend instance + from mellea.stdlib.base import SimpleContext + + session = mellea.MelleaSession( + backend=backend_instance, ctx=SimpleContext() + ) # need to reset to SimpleContext? 
print what is being judged by the judge (input) + return session + + except Exception as e: + console.print( + f"[red]Error creating session with backend={backend}, model={model_id}: {e}[/red]" + ) + raise + + +def run_evaluations( + test_files: List[str], + backend: str, + model: str | None, + judge_backend: str | None, + judge_model: str | None, + output_path: str, + output_format: str, + verbose: bool, + continue_on_error: bool, +): + """Run all 'unit test' evaluations""" + all_test_evals: List[TestBasedEval] = [] + + for test_file in test_files: + try: + test_evals = TestBasedEval.from_json_file(test_file) + all_test_evals.extend(test_evals) + console.print(f"Loaded {len(test_evals)} test evaluations from {test_file}") + except Exception as e: + console.print(f"Error loading {test_file}") + + if not all_test_evals: + console.print("Failed to load any test evaluations") + return + + console.print(f"Total test evals to run: {len(all_test_evals)}") + + console.print(f"Generation model: {model}") + console.print(f"Judge model: {judge_model}") + + m = create_session(backend=backend, model=model) + judge_session = create_session(backend=judge_backend, model=judge_model) + + all_results = [] + + # some visuals on progress with rich, we can take out / modify + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + console=console, + ) as progress: + task = progress.add_task("Running evals", total=len(all_test_evals)) + for test_eval in all_test_evals: + try: + result = execute_test_eval( + test_eval=test_eval, + generation_session=m, + judge_session=judge_session, + ) + all_results.append(result) + except Exception as e: + console.print(f"Error {e} on test {test_eval.conversation_id}") + if not continue_on_error: + raise + + progress.advance(task) + + summary_stats(all_results) + save_results(all_results, output_path, output_format) + + m.cleanup() + judge_session.cleanup() + + +def execute_test_eval( + test_eval: TestBasedEval, + generation_session: mellea.MelleaSession, + judge_session: mellea.MelleaSession, +) -> TestEvalResult: + """Execute a single test evaluation + For each input in the test, generate a response using generation_session + Then, after all inputs are processed, validate using judge_session + """ + + input_results = [] + + # for all inputs, generate responses with generator + for input_text in test_eval.inputs: + result: ModelOutputThunk = generation_session.act(input_text) + model_output = str(result) + console.print(model_output) + + judge_session.ctx = judge_session.ctx.add(result) + + requirement = Requirement( + description=create_judge_requirement(test_eval, input_text, model_output) + ) + validation_results = judge_session.validate(requirement) + validation_result = validation_results[0] + + judge_output = validation_result.reason or "" + score, justification = parse_judge_output(judge_output) + + passed = score == 1 if score is not None else validation_result.as_bool() + + input_result = InputEvalResult( + input_text=input_text, + model_output=model_output, + validation_passed=passed, + score=score, + validation_reason=justification, + ) + input_results.append(input_result) + + # reset both generator and judge -- might not be necessary since SimpleContext doesn't retain history + generation_session.reset() + judge_session.reset() + + test_result = TestEvalResult(test_eval=test_eval, input_results=input_results) + return test_result + + +def 
create_judge_requirement( + test_eval: TestBasedEval, input_text: str, model_output: str +): + """Create judge requirement description""" + + if len(test_eval.targets) == 0: # no reference + target_text = "N/A" # another way to handle this? + elif len(test_eval.targets) == 1: + target_text = test_eval.targets[0] + else: # enumerate the multiple targets + target_text = "\n".join( + [f"{i}. {target}" for i, target in enumerate(test_eval.targets, 1)] + ) + + judge_prompt = test_eval.judge_prompt.format( + input=input_text, + prediction=model_output, + target=target_text, + guidelines=test_eval.unit_test_instructions, + ) + + return judge_prompt + + +def parse_judge_output(judge_output: str): + try: + json_match = re.search(r'\{[^}]*"score"[^}]*\}', judge_output, re.DOTALL) + if json_match: + json_str = json_match.group(0) + data = json.loads(json_str) + score = data.get("score") + justification = data.get("justification") + return score, justification + except (json.JSONDecodeError, AttributeError): + pass + + # if the above fails, search the text for the score + score_match = re.search(r'score["\s:]+(\d+)', judge_output, re.IGNORECASE) + if score_match: + score = int(score_match.group(1)) + return score, judge_output + + return None, judge_output + + +def save_results(results: List[TestEvalResult], output_path: str, output_format: str): + output_path_obj = Path(output_path) + if output_path_obj.suffix != f".{output_format}": + output_path_obj = Path(f"{output_path}.{output_format}") + + total_inputs = sum(r.total_count for r in results) + passed_inputs = sum(r.passed_count for r in results) + overall_pass_rate = passed_inputs / total_inputs if total_inputs > 0 else 0.0 + + if output_format == "jsonl": + with output_path_obj.open("w") as f: + for result in results: + f.write(json.dumps(result.to_dict()) + "\n") + else: # json + summary = { + "total_unit_tests": len(results), + "total_inputs": total_inputs, + "passed_inputs": passed_inputs, + "failed_inputs": total_inputs - passed_inputs, + "overall_pass_rate": overall_pass_rate, + } + + with output_path_obj.open("w") as f: + json.dump( + {"summary": summary, "results": [r.to_dict() for r in results]}, + f, + indent=2, + ) + + console.print(f"Results saved to {output_path}") + + +def summary_stats(results: List[TestEvalResult]): + total_inputs = sum(r.total_count for r in results) + passed_inputs = sum(r.passed_count for r in results) + overall_pass_rate = passed_inputs / total_inputs if total_inputs > 0 else 0.0 + + console.print(f"Total number of inputs across tests: {total_inputs}") + console.print(f"Number of inputs passed across tests: {passed_inputs}") + console.print(f"Cumulative Pass Rate: {overall_pass_rate}") + + if len(results) > 1: + console.print("Per-Test Breakdown:") + for result in results: + console.print( + f"{result.test_eval.conversation_id}:\n\t{result.passed_count}/{result.total_count} ({result.pass_rate * 100:.1f}%)\n\n" + ) diff --git a/cli/m.py b/cli/m.py index 3aa32aa1..07fc14b9 100644 --- a/cli/m.py +++ b/cli/m.py @@ -5,6 +5,7 @@ from cli.alora.commands import alora_app from cli.decompose import app as decompose_app from cli.serve.app import serve +from cli.eval.commands import eval_app cli = typer.Typer(name="m", no_args_is_help=True) @@ -25,3 +26,5 @@ def callback() -> None: # as documented: https://typer.tiangolo.com/tutorial/subcommands/add-typer/#put-them-together. 
cli.add_typer(alora_app) cli.add_typer(decompose_app) + +cli.add_typer(eval_app) diff --git a/mellea/__init__.py b/mellea/__init__.py index a8fc24fa..d100f4d9 100644 --- a/mellea/__init__.py +++ b/mellea/__init__.py @@ -3,5 +3,6 @@ import mellea.backends.model_ids as model_ids from mellea.stdlib.genslot import generative from mellea.stdlib.session import MelleaSession, start_session +from mellea.stdlib.test_based_eval import TestBasedEval -__all__ = ["MelleaSession", "generative", "model_ids", "start_session"] +__all__ = ["MelleaSession", "TestBasedEval", "generative", "model_ids", "start_session"] diff --git a/mellea/stdlib/test_based_eval.py b/mellea/stdlib/test_based_eval.py new file mode 100644 index 00000000..fb361e13 --- /dev/null +++ b/mellea/stdlib/test_based_eval.py @@ -0,0 +1,141 @@ +"""LLM Evaluation with Unit Tests in Mellea.""" + +import json +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from mellea.stdlib.base import CBlock, Component, TemplateRepresentation + + +class TestBasedEval(Component): + """Each TestBasedEval represents a single unit test.""" + + def __init__( + self, + category: str, + judge_prompt: str, + inputs: list[str], + unit_test_instructions: str, + targets: list[str] | None = None, # can be optional + conversation_id: str | None = None, # in case we change format later + metadata: dict[str, Any] | None = None, # for anything miscellaneous + ): + """Initialize TestBasedEval (for a single unit test).""" + self.category = category + self.judge_prompt = judge_prompt + self.inputs = inputs + self.unit_test_instructions = unit_test_instructions + self.targets = targets or [] + self.conversation_id = conversation_id + self.metadata = metadata or {} + + def format_for_llm(self) -> TemplateRepresentation | str: + """Format for use by judge LLM session.""" + return self.category + "\n\n" + self.judge_prompt + + # if making a new jinja template + # return TemplateRepresentation( + # obj=self, + # args={ + # "category": self.category, + # "judge_prompt": self.judge_prompt, + # "inputs": self.inputs, + # "unit_test_instructions": self.unit_test_instructions, + # "targets": self.targets, + # }, + # #template_order=["*"] + # ) + + @classmethod + def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: + """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'.""" + path = Path(filepath) + test_data = [] + + if path.suffix == ".jsonl": + with path.open("r") as f: + for line in f: + if line.strip(): + data = json.loads(line) + if not data.get("skip", False): + test_data.append(data) + else: # '.json' + with path.open("r") as f: + data = json.load(f) + if isinstance(data, list): + test_data = [item for item in data if not item.get("skip", False)] + + # group by conversation_id -- a single 'unit test' + grouped = defaultdict(list) + for item in test_data: + conversation_id = item.get("conversation_id") + grouped[conversation_id].append(item) + + test_evals = [] + for conversation_id, items in grouped.items(): + first_item = items[0] + ut_instructions = first_item.get("unit test instructions", "") + category = first_item.get("dataset", path.stem) + + inputs = [] + for item in items: + for turn in item.get("input", []): + if turn.get("speaker") == "user": + inputs.append(turn.get("text")) + + targets = [] + for item in items: + for turn in item.get("targets", []): + if turn.get("speaker") == "agent": + targets.append(turn.get("text")) + + # 
figure out how to add judge prompt template here + judge_prompt = """**Input to the model** + + {input} + + **Model output to be rated** + + {prediction} + + **Ground truth text** + + {target} + + **Rating Guidelines** + The model output should adhere to the following guidelines: + {guidelines} + + **Scoring Criteria** + * Score 0: The model output violates any of the guidelines. + * Score 1: The model output is well aligned with the ground truth - if it exists, the input to the model, and adheres to all guidelines. + + **Return Your Rating** + Return your rating in the following format: + {{\"score\": your_score, \"justification\": \"your_justification\"}} + + Your rating: + """ + + metadata = { + "task_type": first_item.get( + "task_type" + ), # for displaying results maybe? + "dataset": first_item.get("dataset"), + "items": items, # keep list of original items for reference + } + + test_eval = cls( + category=category, + judge_prompt=judge_prompt, + inputs=inputs, + unit_test_instructions=ut_instructions, + targets=targets, + conversation_id=conversation_id, + metadata=metadata, + ) + test_evals.append(test_eval) + + return test_evals From 21f7960db42175a4f2542867cb73af68bf164258 Mon Sep 17 00:00:00 2001 From: "Keshav Ramji Keshav.Ramji@ibm.com" Date: Thu, 6 Nov 2025 16:33:27 +0000 Subject: [PATCH 2/3] Update v1 data format and judge call --- .gitignore | 2 + cli/eval/commands.py | 4 + cli/eval/runner.py | 78 +++++++++-------- mellea/stdlib/test_based_eval.py | 138 +++++++++++-------------------- 4 files changed, 93 insertions(+), 129 deletions(-) diff --git a/.gitignore b/.gitignore index f391d4a6..9249c799 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ kr_results/ kr_data/ xet/ +job.sh +hub/ # Python-generated files __pycache__/ diff --git a/cli/eval/commands.py b/cli/eval/commands.py index 3ca7f4cc..65d8e817 100644 --- a/cli/eval/commands.py +++ b/cli/eval/commands.py @@ -9,10 +9,12 @@ def eval_run( ), backend: str = typer.Option("ollama", "--backend", "-b", help="Generation backend"), model: str = typer.Option(None, "--model", help="Generation model name"), + max_gen_tokens: int = typer.Option(256, "--max-gen-tokens", help="Max tokens to generate for responses"), judge_backend: str = typer.Option( None, "--judge-backend", "-jb", help="Judge backend" ), judge_model: str = typer.Option(None, "--judge-model", help="Judge model name"), + max_judge_tokens: int = typer.Option(256, "--max-judge-tokens", help="Max tokens for the judge model's judgement."), output_path: str = typer.Option( "eval_results", "--output-path", "-o", help="Output path for results" ), @@ -28,8 +30,10 @@ def eval_run( test_files=test_files, backend=backend, model=model, + max_gen_tokens=max_gen_tokens, judge_backend=judge_backend, judge_model=judge_model, + max_judge_tokens=max_judge_tokens, output_path=output_path, output_format=output_format, verbose=verbose, diff --git a/cli/eval/runner.py b/cli/eval/runner.py index 7b2e7eca..2d24ae80 100644 --- a/cli/eval/runner.py +++ b/cli/eval/runner.py @@ -5,13 +5,11 @@ import mellea from mellea.stdlib.base import ModelOutputThunk -from mellea.stdlib.requirement import Requirement from mellea.stdlib.test_based_eval import TestBasedEval from mellea.backends.types import ModelOption from rich.console import Console from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn -from rich.table import Table console = Console() @@ -25,7 +23,7 @@ def __init__( model_output: str, validation_passed: bool, score: int, - validation_reason: str, + 
validation_reason: str, # add input_id ): self.input_text = input_text self.model_output = model_output @@ -52,15 +50,15 @@ def __init__(self, test_eval: TestBasedEval, input_results: list[InputEvalResult def to_dict(self): return { - "conversation_id": self.test_eval.conversation_id, - "category": self.test_eval.category, + "test_id": self.test_eval.test_id, + "source": self.test_eval.source, + "name": self.test_eval.name, + "instructions": self.test_eval.instructions, "input_results": [r.to_dict() for r in self.input_results], "expected_targets": self.test_eval.targets, - "unit_test_instructions": self.test_eval.unit_test_instructions, "passed": self.passed_count, "total_count": self.total_count, "pass_rate": self.pass_rate, - "metadata": self.test_eval.metadata, } @property @@ -76,7 +74,7 @@ def pass_rate(self) -> float: return self.passed_count / self.total_count if self.total_count > 0 else 0.0 -def create_session(backend: str, model: str | None) -> mellea.MelleaSession: +def create_session(backend: str, model: str | None, max_tokens: int | None) -> mellea.MelleaSession: """Create a mellea session with the specified backend and model.""" model_id = None @@ -98,35 +96,35 @@ def create_session(backend: str, model: str | None) -> mellea.MelleaSession: from mellea.backends.ollama import OllamaModelBackend backend_instance = OllamaModelBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} ) elif backend_lower == "openai": from mellea.backends.openai import OpenAIBackend backend_instance = OpenAIBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} ) elif backend_lower in ["hf", "huggingface"]: from mellea.backends.huggingface import LocalHFBackend backend_instance = LocalHFBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower == "watsonx": from mellea.backends.watsonx import WatsonxAIBackend backend_instance = WatsonxAIBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} ) elif backend_lower == "litellm": from mellea.backends.litellm import LiteLLMBackend backend_instance = LiteLLMBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} ) else: @@ -139,7 +137,7 @@ def create_session(backend: str, model: str | None) -> mellea.MelleaSession: session = mellea.MelleaSession( backend=backend_instance, ctx=SimpleContext() - ) # need to reset to SimpleContext? 
print what is being judged by the judge (input) + ) return session except Exception as e: @@ -153,8 +151,10 @@ def run_evaluations( test_files: List[str], backend: str, model: str | None, + max_gen_tokens: int | None, judge_backend: str | None, judge_model: str | None, + max_judge_tokens: int | None, output_path: str, output_format: str, verbose: bool, @@ -176,16 +176,17 @@ def run_evaluations( return console.print(f"Total test evals to run: {len(all_test_evals)}") + total_inputs = sum(len(te.inputs) for te in all_test_evals) + console.print(f"Total inputs to run: {total_inputs}") console.print(f"Generation model: {model}") console.print(f"Judge model: {judge_model}") - m = create_session(backend=backend, model=model) - judge_session = create_session(backend=judge_backend, model=judge_model) + m = create_session(backend=backend, model=model, max_tokens=max_gen_tokens) + judge_session = create_session(backend=judge_backend, model=judge_model, max_tokens=max_judge_tokens) all_results = [] - # some visuals on progress with rich, we can take out / modify with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -203,7 +204,7 @@ def run_evaluations( ) all_results.append(result) except Exception as e: - console.print(f"Error {e} on test {test_eval.conversation_id}") + console.print(f"Error {e} on test {test_eval.test_id}") if not continue_on_error: raise @@ -229,23 +230,20 @@ def execute_test_eval( input_results = [] # for all inputs, generate responses with generator - for input_text in test_eval.inputs: + for idx, input_text in enumerate(test_eval.inputs): result: ModelOutputThunk = generation_session.act(input_text) model_output = str(result) - console.print(model_output) judge_session.ctx = judge_session.ctx.add(result) - requirement = Requirement( - description=create_judge_requirement(test_eval, input_text, model_output) - ) - validation_results = judge_session.validate(requirement) - validation_result = validation_results[0] + targets_for_input = (test_eval.targets[idx] if idx < len(test_eval.targets) else []) - judge_output = validation_result.reason or "" + # query the judge + judge_prompt = create_judge_requirement(test_eval, input_text, model_output, targets_for_input) + judge_output_thunk = judge_session.act(judge_prompt) + judge_output = str(judge_output_thunk) score, justification = parse_judge_output(judge_output) - - passed = score == 1 if score is not None else validation_result.as_bool() + passed = score == 1 if score is not None else False input_result = InputEvalResult( input_text=input_text, @@ -256,7 +254,7 @@ def execute_test_eval( ) input_results.append(input_result) - # reset both generator and judge -- might not be necessary since SimpleContext doesn't retain history + # reset both generator and judge generation_session.reset() judge_session.reset() @@ -265,24 +263,24 @@ def execute_test_eval( def create_judge_requirement( - test_eval: TestBasedEval, input_text: str, model_output: str + test_eval: TestBasedEval, input_text: str, model_output: str, targets_for_input: list[str] ): """Create judge requirement description""" - if len(test_eval.targets) == 0: # no reference - target_text = "N/A" # another way to handle this? 
- elif len(test_eval.targets) == 1: - target_text = test_eval.targets[0] - else: # enumerate the multiple targets + if len(targets_for_input) == 0: # no reference + target_text = "N/A" + elif len(targets_for_input) == 1: + target_text = targets_for_input[0] + else: # enumerate when there are multiple targets target_text = "\n".join( - [f"{i}. {target}" for i, target in enumerate(test_eval.targets, 1)] + [f"{i}. {target}" for i, target in enumerate(targets_for_input, 1)] ) judge_prompt = test_eval.judge_prompt.format( input=input_text, prediction=model_output, target=target_text, - guidelines=test_eval.unit_test_instructions, + guidelines=test_eval.instructions, ) return judge_prompt @@ -324,7 +322,7 @@ def save_results(results: List[TestEvalResult], output_path: str, output_format: f.write(json.dumps(result.to_dict()) + "\n") else: # json summary = { - "total_unit_tests": len(results), + "total_tests": len(results), "total_inputs": total_inputs, "passed_inputs": passed_inputs, "failed_inputs": total_inputs - passed_inputs, @@ -348,11 +346,11 @@ def summary_stats(results: List[TestEvalResult]): console.print(f"Total number of inputs across tests: {total_inputs}") console.print(f"Number of inputs passed across tests: {passed_inputs}") - console.print(f"Cumulative Pass Rate: {overall_pass_rate}") + console.print(f"Cumulative Pass Rate: {overall_pass_rate * 100:.1f}%") if len(results) > 1: console.print("Per-Test Breakdown:") for result in results: console.print( - f"{result.test_eval.conversation_id}:\n\t{result.passed_count}/{result.total_count} ({result.pass_rate * 100:.1f}%)\n\n" + f"{result.test_eval.name}:\n\t{result.passed_count}/{result.total_count} ({result.pass_rate * 100:.1f}%)\n\n" ) diff --git a/mellea/stdlib/test_based_eval.py b/mellea/stdlib/test_based_eval.py index fb361e13..8d740b6d 100644 --- a/mellea/stdlib/test_based_eval.py +++ b/mellea/stdlib/test_based_eval.py @@ -1,12 +1,10 @@ """LLM Evaluation with Unit Tests in Mellea.""" import json -from collections import defaultdict -from dataclasses import dataclass from pathlib import Path from typing import Any -from mellea.stdlib.base import CBlock, Component, TemplateRepresentation +from mellea.stdlib.base import Component class TestBasedEval(Component): @@ -14,85 +12,24 @@ class TestBasedEval(Component): def __init__( self, - category: str, - judge_prompt: str, + source: str, + name: str, + instructions: str, inputs: list[str], - unit_test_instructions: str, - targets: list[str] | None = None, # can be optional - conversation_id: str | None = None, # in case we change format later - metadata: dict[str, Any] | None = None, # for anything miscellaneous + targets: list[list[str]] | None = None, # can be optional + test_id: str | None = None, + input_ids: list[str] | None = None ): """Initialize TestBasedEval (for a single unit test).""" - self.category = category - self.judge_prompt = judge_prompt + self.source = source + self.name = name + self.instructions = instructions self.inputs = inputs - self.unit_test_instructions = unit_test_instructions self.targets = targets or [] - self.conversation_id = conversation_id - self.metadata = metadata or {} - - def format_for_llm(self) -> TemplateRepresentation | str: - """Format for use by judge LLM session.""" - return self.category + "\n\n" + self.judge_prompt - - # if making a new jinja template - # return TemplateRepresentation( - # obj=self, - # args={ - # "category": self.category, - # "judge_prompt": self.judge_prompt, - # "inputs": self.inputs, - # "unit_test_instructions": 
self.unit_test_instructions, - # "targets": self.targets, - # }, - # #template_order=["*"] - # ) + self.test_id = test_id + self.input_ids = input_ids or [] - @classmethod - def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: - """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'.""" - path = Path(filepath) - test_data = [] - - if path.suffix == ".jsonl": - with path.open("r") as f: - for line in f: - if line.strip(): - data = json.loads(line) - if not data.get("skip", False): - test_data.append(data) - else: # '.json' - with path.open("r") as f: - data = json.load(f) - if isinstance(data, list): - test_data = [item for item in data if not item.get("skip", False)] - - # group by conversation_id -- a single 'unit test' - grouped = defaultdict(list) - for item in test_data: - conversation_id = item.get("conversation_id") - grouped[conversation_id].append(item) - - test_evals = [] - for conversation_id, items in grouped.items(): - first_item = items[0] - ut_instructions = first_item.get("unit test instructions", "") - category = first_item.get("dataset", path.stem) - - inputs = [] - for item in items: - for turn in item.get("input", []): - if turn.get("speaker") == "user": - inputs.append(turn.get("text")) - - targets = [] - for item in items: - for turn in item.get("targets", []): - if turn.get("speaker") == "agent": - targets.append(turn.get("text")) - - # figure out how to add judge prompt template here - judge_prompt = """**Input to the model** + self.judge_prompt = """**Input to the model** {input} @@ -119,23 +56,46 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: Your rating: """ - metadata = { - "task_type": first_item.get( - "task_type" - ), # for displaying results maybe? 
- "dataset": first_item.get("dataset"), - "items": items, # keep list of original items for reference - } + @classmethod + def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: + """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'.""" + path = Path(filepath) + + with path.open('r') as f: + data = json.load(f) + + if not isinstance(data, list): + data = [data] + + test_evals = [] + for test_data in data: + examples = test_data.get("examples", []) + + inputs = [] + targets = [] + input_ids = [] + + for example in examples: + input_messages = example.get("input", []) + user_messages = [msg for msg in input_messages if msg.get("role") == "user"] + if user_messages: + inputs.append(user_messages[-1].get("content", "")) + + target_messages = example.get("targets", []) + targets_for_input = [msg.get("content", "") for msg in target_messages if msg.get("role") == "assistant"] + targets.append(targets_for_input) + + input_ids.append(example.get("input_id", "")) test_eval = cls( - category=category, - judge_prompt=judge_prompt, + source=test_data.get("source", "unknown"), + name=test_data.get("name", ""), + instructions=test_data.get("instructions", ""), inputs=inputs, - unit_test_instructions=ut_instructions, targets=targets, - conversation_id=conversation_id, - metadata=metadata, + test_id=test_data.get("id", ""), + input_ids=input_ids ) test_evals.append(test_eval) - return test_evals + return test_evals \ No newline at end of file From 7919b5bf7896af457e3c9ce0f85a22b7ea55bd2e Mon Sep 17 00:00:00 2001 From: "Keshav Ramji Keshav.Ramji@ibm.com" Date: Thu, 6 Nov 2025 17:29:20 +0000 Subject: [PATCH 3/3] Pre-commit fixes --- cli/eval/commands.py | 10 ++++--- cli/eval/runner.py | 47 ++++++++++++++++++++------------ mellea/stdlib/reqlib/md.py | 13 +++++++-- mellea/stdlib/test_based_eval.py | 18 ++++++++---- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/cli/eval/commands.py b/cli/eval/commands.py index 65d8e817..e4ddea6c 100644 --- a/cli/eval/commands.py +++ b/cli/eval/commands.py @@ -9,19 +9,22 @@ def eval_run( ), backend: str = typer.Option("ollama", "--backend", "-b", help="Generation backend"), model: str = typer.Option(None, "--model", help="Generation model name"), - max_gen_tokens: int = typer.Option(256, "--max-gen-tokens", help="Max tokens to generate for responses"), + max_gen_tokens: int = typer.Option( + 256, "--max-gen-tokens", help="Max tokens to generate for responses" + ), judge_backend: str = typer.Option( None, "--judge-backend", "-jb", help="Judge backend" ), judge_model: str = typer.Option(None, "--judge-model", help="Judge model name"), - max_judge_tokens: int = typer.Option(256, "--max-judge-tokens", help="Max tokens for the judge model's judgement."), + max_judge_tokens: int = typer.Option( + 256, "--max-judge-tokens", help="Max tokens for the judge model's judgement." 
+ ), output_path: str = typer.Option( "eval_results", "--output-path", "-o", help="Output path for results" ), output_format: str = typer.Option( "json", "--output-format", help="Either json or jsonl format for results" ), - verbose: bool = typer.Option(False, "--verbose", "-v"), continue_on_error: bool = typer.Option(True, "--continue-on-error"), ): from cli.eval.runner import run_evaluations @@ -36,7 +39,6 @@ def eval_run( max_judge_tokens=max_judge_tokens, output_path=output_path, output_format=output_format, - verbose=verbose, continue_on_error=continue_on_error, ) diff --git a/cli/eval/runner.py b/cli/eval/runner.py index 2d24ae80..e320e0d0 100644 --- a/cli/eval/runner.py +++ b/cli/eval/runner.py @@ -23,7 +23,7 @@ def __init__( model_output: str, validation_passed: bool, score: int, - validation_reason: str, # add input_id + validation_reason: str, # add input_id ): self.input_text = input_text self.model_output = model_output @@ -74,7 +74,9 @@ def pass_rate(self) -> float: return self.passed_count / self.total_count if self.total_count > 0 else 0.0 -def create_session(backend: str, model: str | None, max_tokens: int | None) -> mellea.MelleaSession: +def create_session( + backend: str, model: str | None, max_tokens: int | None +) -> mellea.MelleaSession: """Create a mellea session with the specified backend and model.""" model_id = None @@ -96,35 +98,40 @@ def create_session(backend: str, model: str | None, max_tokens: int | None) -> m from mellea.backends.ollama import OllamaModelBackend backend_instance = OllamaModelBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower == "openai": from mellea.backends.openai import OpenAIBackend backend_instance = OpenAIBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower in ["hf", "huggingface"]: from mellea.backends.huggingface import LocalHFBackend backend_instance = LocalHFBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower == "watsonx": from mellea.backends.watsonx import WatsonxAIBackend backend_instance = WatsonxAIBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower == "litellm": from mellea.backends.litellm import LiteLLMBackend backend_instance = LiteLLMBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) else: @@ -135,9 +142,7 @@ def create_session(backend: str, model: str | None, max_tokens: int | None) -> m # create session with backend instance from mellea.stdlib.base import SimpleContext - session = mellea.MelleaSession( - backend=backend_instance, ctx=SimpleContext() - ) + session = mellea.MelleaSession(backend=backend_instance, ctx=SimpleContext()) return session except Exception as e: @@ -157,7 +162,6 @@ def run_evaluations( max_judge_tokens: int | None, output_path: str, output_format: str, - verbose: bool, continue_on_error: bool, ): """Run all 'unit test' evaluations""" @@ -176,14 +180,16 @@ def run_evaluations( return console.print(f"Total test evals to run: {len(all_test_evals)}") - total_inputs = 
sum(len(te.inputs) for te in all_test_evals) + total_inputs = sum(len(test_eval.inputs) for test_eval in all_test_evals) console.print(f"Total inputs to run: {total_inputs}") console.print(f"Generation model: {model}") console.print(f"Judge model: {judge_model}") m = create_session(backend=backend, model=model, max_tokens=max_gen_tokens) - judge_session = create_session(backend=judge_backend, model=judge_model, max_tokens=max_judge_tokens) + judge_session = create_session( + backend=judge_backend, model=judge_model, max_tokens=max_judge_tokens + ) all_results = [] @@ -234,12 +240,14 @@ def execute_test_eval( result: ModelOutputThunk = generation_session.act(input_text) model_output = str(result) - judge_session.ctx = judge_session.ctx.add(result) - - targets_for_input = (test_eval.targets[idx] if idx < len(test_eval.targets) else []) + targets_for_input = ( + test_eval.targets[idx] if idx < len(test_eval.targets) else [] + ) # query the judge - judge_prompt = create_judge_requirement(test_eval, input_text, model_output, targets_for_input) + judge_prompt = create_judge_requirement( + test_eval, input_text, model_output, targets_for_input + ) judge_output_thunk = judge_session.act(judge_prompt) judge_output = str(judge_output_thunk) score, justification = parse_judge_output(judge_output) @@ -263,7 +271,10 @@ def execute_test_eval( def create_judge_requirement( - test_eval: TestBasedEval, input_text: str, model_output: str, targets_for_input: list[str] + test_eval: TestBasedEval, + input_text: str, + model_output: str, + targets_for_input: list[str], ): """Create judge requirement description""" diff --git a/mellea/stdlib/reqlib/md.py b/mellea/stdlib/reqlib/md.py index 3cee2770..9a1836ed 100644 --- a/mellea/stdlib/reqlib/md.py +++ b/mellea/stdlib/reqlib/md.py @@ -14,10 +14,14 @@ def as_markdown_list(ctx: Context) -> list[str] | None: raw_output = ctx.last_output() assert raw_output is not None try: + assert raw_output.value is not None parsed = mistletoe.Document(raw_output.value) - for child in parsed.children: + assert parsed.children is not None + children = list(parsed.children) + for child in children: if type(child) is not mistletoe.block_token.List: return None + assert child.children is not None for item in child.children: xs.append(mistletoe.base_renderer.BaseRenderer().render(item)) return xs @@ -44,10 +48,13 @@ def _md_table(ctx: Context): raw_output = ctx.last_output() assert raw_output is not None try: + assert raw_output.value is not None parsed = mistletoe.Document(raw_output.value) - if len(parsed.children) != 1: + assert parsed.children is not None + children = list(parsed.children) + if len(children) != 1: return False - return type(parsed.children[0]) is mistletoe.block_token.Table + return type(children[0]) is mistletoe.block_token.Table except Exception: return False diff --git a/mellea/stdlib/test_based_eval.py b/mellea/stdlib/test_based_eval.py index 8d740b6d..a51ae414 100644 --- a/mellea/stdlib/test_based_eval.py +++ b/mellea/stdlib/test_based_eval.py @@ -18,7 +18,7 @@ def __init__( inputs: list[str], targets: list[list[str]] | None = None, # can be optional test_id: str | None = None, - input_ids: list[str] | None = None + input_ids: list[str] | None = None, ): """Initialize TestBasedEval (for a single unit test).""" self.source = source @@ -61,7 +61,7 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'.""" path = Path(filepath) - with 
path.open('r') as f: + with path.open("r") as f: data = json.load(f) if not isinstance(data, list): @@ -77,12 +77,18 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: for example in examples: input_messages = example.get("input", []) - user_messages = [msg for msg in input_messages if msg.get("role") == "user"] + user_messages = [ + msg for msg in input_messages if msg.get("role") == "user" + ] if user_messages: inputs.append(user_messages[-1].get("content", "")) target_messages = example.get("targets", []) - targets_for_input = [msg.get("content", "") for msg in target_messages if msg.get("role") == "assistant"] + targets_for_input = [ + msg.get("content", "") + for msg in target_messages + if msg.get("role") == "assistant" + ] targets.append(targets_for_input) input_ids.append(example.get("input_id", "")) @@ -94,8 +100,8 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: inputs=inputs, targets=targets, test_id=test_data.get("id", ""), - input_ids=input_ids + input_ids=input_ids, ) test_evals.append(test_eval) - return test_evals \ No newline at end of file + return test_evals
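
Note for reviewers: a minimal sketch of the data layout and invocation this series
expects, inferred from TestBasedEval.from_json_file and cli/eval/commands.py above.
The file name "tests.json", the ids, and the example contents are placeholders for
illustration only, not fixtures shipped by this patch.

A hypothetical tests.json (one test with a single example input/target pair; field
names follow the reader in mellea/stdlib/test_based_eval.py):

    [
      {
        "id": "demo-001",
        "source": "demo",
        "name": "politeness_check",
        "instructions": "Responses must be polite and written in English.",
        "examples": [
          {
            "input_id": "demo-001-a",
            "input": [
              {"role": "user", "content": "Say hello to the new team members."}
            ],
            "targets": [
              {"role": "assistant", "content": "Hello everyone, welcome to the team!"}
            ]
          }
        ]
      }
    ]

A matching run through the "m" entry point defined in cli/m.py. The judge backend is
passed explicitly because --judge-backend has no default; omitting --model and
--judge-model falls back to mellea.model_ids.IBM_GRANITE_4_MICRO_3B inside
create_session:

    m eval run tests.json \
        --backend ollama \
        --judge-backend ollama \
        --max-gen-tokens 256 --max-judge-tokens 256 \
        --output-path eval_results --output-format json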