From c46d41e1cea7db238e0fdee45521a6c8b71d80ca Mon Sep 17 00:00:00 2001 From: "Keshav Ramji Keshav.Ramji@ibm.com" Date: Wed, 29 Oct 2025 21:53:13 +0000 Subject: [PATCH 1/3] v1 working --- .gitignore | 5 + cli/eval/__init__.py | 1 + cli/eval/commands.py | 40 ++++ cli/eval/runner.py | 358 +++++++++++++++++++++++++++++++ cli/m.py | 3 + mellea/__init__.py | 3 +- mellea/stdlib/test_based_eval.py | 141 ++++++++++++ 7 files changed, 550 insertions(+), 1 deletion(-) create mode 100644 cli/eval/__init__.py create mode 100644 cli/eval/commands.py create mode 100644 cli/eval/runner.py create mode 100644 mellea/stdlib/test_based_eval.py diff --git a/.gitignore b/.gitignore index 6b5814f3..f391d4a6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# KR files +kr_results/ +kr_data/ +xet/ + # Python-generated files __pycache__/ *.py[oc] diff --git a/cli/eval/__init__.py b/cli/eval/__init__.py new file mode 100644 index 00000000..7f625a26 --- /dev/null +++ b/cli/eval/__init__.py @@ -0,0 +1 @@ +"""CLI for test-based evaluation""" diff --git a/cli/eval/commands.py b/cli/eval/commands.py new file mode 100644 index 00000000..3ca7f4cc --- /dev/null +++ b/cli/eval/commands.py @@ -0,0 +1,40 @@ +import typer + +eval_app = typer.Typer(name="eval") + + +def eval_run( + test_files: list[str] = typer.Argument( + ..., help="List of paths to json/jsonl files containing test cases" + ), + backend: str = typer.Option("ollama", "--backend", "-b", help="Generation backend"), + model: str = typer.Option(None, "--model", help="Generation model name"), + judge_backend: str = typer.Option( + None, "--judge-backend", "-jb", help="Judge backend" + ), + judge_model: str = typer.Option(None, "--judge-model", help="Judge model name"), + output_path: str = typer.Option( + "eval_results", "--output-path", "-o", help="Output path for results" + ), + output_format: str = typer.Option( + "json", "--output-format", help="Either json or jsonl format for results" + ), + verbose: bool = typer.Option(False, "--verbose", "-v"), + continue_on_error: bool = typer.Option(True, "--continue-on-error"), +): + from cli.eval.runner import run_evaluations + + run_evaluations( + test_files=test_files, + backend=backend, + model=model, + judge_backend=judge_backend, + judge_model=judge_model, + output_path=output_path, + output_format=output_format, + verbose=verbose, + continue_on_error=continue_on_error, + ) + + +eval_app.command("run")(eval_run) diff --git a/cli/eval/runner.py b/cli/eval/runner.py new file mode 100644 index 00000000..7b2e7eca --- /dev/null +++ b/cli/eval/runner.py @@ -0,0 +1,358 @@ +import json +import re +from pathlib import Path +from typing import List + +import mellea +from mellea.stdlib.base import ModelOutputThunk +from mellea.stdlib.requirement import Requirement +from mellea.stdlib.test_based_eval import TestBasedEval +from mellea.backends.types import ModelOption + +from rich.console import Console +from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn +from rich.table import Table + +console = Console() + + +class InputEvalResult: + """Store results of a single input evaluation (within a unit test)""" + + def __init__( + self, + input_text: str, + model_output: str, + validation_passed: bool, + score: int, + validation_reason: str, + ): + self.input_text = input_text + self.model_output = model_output + self.validation_passed = validation_passed + self.score = score + self.validation_reason = validation_reason + + def to_dict(self): + return { + "input": self.input_text, + "model_output": 
self.model_output, + "passed": self.validation_passed, + "score": self.score, + "justification": self.validation_reason, + } + + +class TestEvalResult: + """Store results of a single test evaluation""" + + def __init__(self, test_eval: TestBasedEval, input_results: list[InputEvalResult]): + self.test_eval = test_eval + self.input_results = input_results + + def to_dict(self): + return { + "conversation_id": self.test_eval.conversation_id, + "category": self.test_eval.category, + "input_results": [r.to_dict() for r in self.input_results], + "expected_targets": self.test_eval.targets, + "unit_test_instructions": self.test_eval.unit_test_instructions, + "passed": self.passed_count, + "total_count": self.total_count, + "pass_rate": self.pass_rate, + "metadata": self.test_eval.metadata, + } + + @property + def passed_count(self) -> int: + return sum(1 for r in self.input_results if r.validation_passed) + + @property + def total_count(self) -> int: + return len(self.input_results) + + @property + def pass_rate(self) -> float: + return self.passed_count / self.total_count if self.total_count > 0 else 0.0 + + +def create_session(backend: str, model: str | None) -> mellea.MelleaSession: + """Create a mellea session with the specified backend and model.""" + + model_id = None + if model: + if model.isupper() or "_" in model: + if hasattr(mellea.model_ids, model): + model_id = getattr(mellea.model_ids, model) + else: + model_id = model + else: + model_id = model + else: + model_id = mellea.model_ids.IBM_GRANITE_4_MICRO_3B + + try: + backend_lower = backend.lower() + + if backend_lower == "ollama": + from mellea.backends.ollama import OllamaModelBackend + + backend_instance = OllamaModelBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + elif backend_lower == "openai": + from mellea.backends.openai import OpenAIBackend + + backend_instance = OpenAIBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + elif backend_lower in ["hf", "huggingface"]: + from mellea.backends.huggingface import LocalHFBackend + + backend_instance = LocalHFBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + elif backend_lower == "watsonx": + from mellea.backends.watsonx import WatsonxAIBackend + + backend_instance = WatsonxAIBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + elif backend_lower == "litellm": + from mellea.backends.litellm import LiteLLMBackend + + backend_instance = LiteLLMBackend( + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + ) + + else: + raise ValueError( + f"Unknown backend: {backend}. Supported: ollama, openai, hf, watsonx, litellm" + ) + + # create session with backend instance + from mellea.stdlib.base import SimpleContext + + session = mellea.MelleaSession( + backend=backend_instance, ctx=SimpleContext() + ) # need to reset to SimpleContext? 
print what is being judged by the judge (input) + return session + + except Exception as e: + console.print( + f"[red]Error creating session with backend={backend}, model={model_id}: {e}[/red]" + ) + raise + + +def run_evaluations( + test_files: List[str], + backend: str, + model: str | None, + judge_backend: str | None, + judge_model: str | None, + output_path: str, + output_format: str, + verbose: bool, + continue_on_error: bool, +): + """Run all 'unit test' evaluations""" + all_test_evals: List[TestBasedEval] = [] + + for test_file in test_files: + try: + test_evals = TestBasedEval.from_json_file(test_file) + all_test_evals.extend(test_evals) + console.print(f"Loaded {len(test_evals)} test evaluations from {test_file}") + except Exception as e: + console.print(f"Error loading {test_file}") + + if not all_test_evals: + console.print("Failed to load any test evaluations") + return + + console.print(f"Total test evals to run: {len(all_test_evals)}") + + console.print(f"Generation model: {model}") + console.print(f"Judge model: {judge_model}") + + m = create_session(backend=backend, model=model) + judge_session = create_session(backend=judge_backend, model=judge_model) + + all_results = [] + + # some visuals on progress with rich, we can take out / modify + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + console=console, + ) as progress: + task = progress.add_task("Running evals", total=len(all_test_evals)) + for test_eval in all_test_evals: + try: + result = execute_test_eval( + test_eval=test_eval, + generation_session=m, + judge_session=judge_session, + ) + all_results.append(result) + except Exception as e: + console.print(f"Error {e} on test {test_eval.conversation_id}") + if not continue_on_error: + raise + + progress.advance(task) + + summary_stats(all_results) + save_results(all_results, output_path, output_format) + + m.cleanup() + judge_session.cleanup() + + +def execute_test_eval( + test_eval: TestBasedEval, + generation_session: mellea.MelleaSession, + judge_session: mellea.MelleaSession, +) -> TestEvalResult: + """Execute a single test evaluation + For each input in the test, generate a response using generation_session + Then, after all inputs are processed, validate using judge_session + """ + + input_results = [] + + # for all inputs, generate responses with generator + for input_text in test_eval.inputs: + result: ModelOutputThunk = generation_session.act(input_text) + model_output = str(result) + console.print(model_output) + + judge_session.ctx = judge_session.ctx.add(result) + + requirement = Requirement( + description=create_judge_requirement(test_eval, input_text, model_output) + ) + validation_results = judge_session.validate(requirement) + validation_result = validation_results[0] + + judge_output = validation_result.reason or "" + score, justification = parse_judge_output(judge_output) + + passed = score == 1 if score is not None else validation_result.as_bool() + + input_result = InputEvalResult( + input_text=input_text, + model_output=model_output, + validation_passed=passed, + score=score, + validation_reason=justification, + ) + input_results.append(input_result) + + # reset both generator and judge -- might not be necessary since SimpleContext doesn't retain history + generation_session.reset() + judge_session.reset() + + test_result = TestEvalResult(test_eval=test_eval, input_results=input_results) + return test_result + + +def 
create_judge_requirement( + test_eval: TestBasedEval, input_text: str, model_output: str +): + """Create judge requirement description""" + + if len(test_eval.targets) == 0: # no reference + target_text = "N/A" # another way to handle this? + elif len(test_eval.targets) == 1: + target_text = test_eval.targets[0] + else: # enumerate the multiple targets + target_text = "\n".join( + [f"{i}. {target}" for i, target in enumerate(test_eval.targets, 1)] + ) + + judge_prompt = test_eval.judge_prompt.format( + input=input_text, + prediction=model_output, + target=target_text, + guidelines=test_eval.unit_test_instructions, + ) + + return judge_prompt + + +def parse_judge_output(judge_output: str): + try: + json_match = re.search(r'\{[^}]*"score"[^}]*\}', judge_output, re.DOTALL) + if json_match: + json_str = json_match.group(0) + data = json.loads(json_str) + score = data.get("score") + justification = data.get("justification") + return score, justification + except (json.JSONDecodeError, AttributeError): + pass + + # if the above fails, search the text for the score + score_match = re.search(r'score["\s:]+(\d+)', judge_output, re.IGNORECASE) + if score_match: + score = int(score_match.group(1)) + return score, judge_output + + return None, judge_output + + +def save_results(results: List[TestEvalResult], output_path: str, output_format: str): + output_path_obj = Path(output_path) + if output_path_obj.suffix != f".{output_format}": + output_path_obj = Path(f"{output_path}.{output_format}") + + total_inputs = sum(r.total_count for r in results) + passed_inputs = sum(r.passed_count for r in results) + overall_pass_rate = passed_inputs / total_inputs if total_inputs > 0 else 0.0 + + if output_format == "jsonl": + with output_path_obj.open("w") as f: + for result in results: + f.write(json.dumps(result.to_dict()) + "\n") + else: # json + summary = { + "total_unit_tests": len(results), + "total_inputs": total_inputs, + "passed_inputs": passed_inputs, + "failed_inputs": total_inputs - passed_inputs, + "overall_pass_rate": overall_pass_rate, + } + + with output_path_obj.open("w") as f: + json.dump( + {"summary": summary, "results": [r.to_dict() for r in results]}, + f, + indent=2, + ) + + console.print(f"Results saved to {output_path}") + + +def summary_stats(results: List[TestEvalResult]): + total_inputs = sum(r.total_count for r in results) + passed_inputs = sum(r.passed_count for r in results) + overall_pass_rate = passed_inputs / total_inputs if total_inputs > 0 else 0.0 + + console.print(f"Total number of inputs across tests: {total_inputs}") + console.print(f"Number of inputs passed across tests: {passed_inputs}") + console.print(f"Cumulative Pass Rate: {overall_pass_rate}") + + if len(results) > 1: + console.print("Per-Test Breakdown:") + for result in results: + console.print( + f"{result.test_eval.conversation_id}:\n\t{result.passed_count}/{result.total_count} ({result.pass_rate * 100:.1f}%)\n\n" + ) diff --git a/cli/m.py b/cli/m.py index 3aa32aa1..07fc14b9 100644 --- a/cli/m.py +++ b/cli/m.py @@ -5,6 +5,7 @@ from cli.alora.commands import alora_app from cli.decompose import app as decompose_app from cli.serve.app import serve +from cli.eval.commands import eval_app cli = typer.Typer(name="m", no_args_is_help=True) @@ -25,3 +26,5 @@ def callback() -> None: # as documented: https://typer.tiangolo.com/tutorial/subcommands/add-typer/#put-them-together. 
cli.add_typer(alora_app) cli.add_typer(decompose_app) + +cli.add_typer(eval_app) diff --git a/mellea/__init__.py b/mellea/__init__.py index a8fc24fa..d100f4d9 100644 --- a/mellea/__init__.py +++ b/mellea/__init__.py @@ -3,5 +3,6 @@ import mellea.backends.model_ids as model_ids from mellea.stdlib.genslot import generative from mellea.stdlib.session import MelleaSession, start_session +from mellea.stdlib.test_based_eval import TestBasedEval -__all__ = ["MelleaSession", "generative", "model_ids", "start_session"] +__all__ = ["MelleaSession", "TestBasedEval", "generative", "model_ids", "start_session"] diff --git a/mellea/stdlib/test_based_eval.py b/mellea/stdlib/test_based_eval.py new file mode 100644 index 00000000..fb361e13 --- /dev/null +++ b/mellea/stdlib/test_based_eval.py @@ -0,0 +1,141 @@ +"""LLM Evaluation with Unit Tests in Mellea.""" + +import json +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from mellea.stdlib.base import CBlock, Component, TemplateRepresentation + + +class TestBasedEval(Component): + """Each TestBasedEval represents a single unit test.""" + + def __init__( + self, + category: str, + judge_prompt: str, + inputs: list[str], + unit_test_instructions: str, + targets: list[str] | None = None, # can be optional + conversation_id: str | None = None, # in case we change format later + metadata: dict[str, Any] | None = None, # for anything miscellaneous + ): + """Initialize TestBasedEval (for a single unit test).""" + self.category = category + self.judge_prompt = judge_prompt + self.inputs = inputs + self.unit_test_instructions = unit_test_instructions + self.targets = targets or [] + self.conversation_id = conversation_id + self.metadata = metadata or {} + + def format_for_llm(self) -> TemplateRepresentation | str: + """Format for use by judge LLM session.""" + return self.category + "\n\n" + self.judge_prompt + + # if making a new jinja template + # return TemplateRepresentation( + # obj=self, + # args={ + # "category": self.category, + # "judge_prompt": self.judge_prompt, + # "inputs": self.inputs, + # "unit_test_instructions": self.unit_test_instructions, + # "targets": self.targets, + # }, + # #template_order=["*"] + # ) + + @classmethod + def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: + """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'.""" + path = Path(filepath) + test_data = [] + + if path.suffix == ".jsonl": + with path.open("r") as f: + for line in f: + if line.strip(): + data = json.loads(line) + if not data.get("skip", False): + test_data.append(data) + else: # '.json' + with path.open("r") as f: + data = json.load(f) + if isinstance(data, list): + test_data = [item for item in data if not item.get("skip", False)] + + # group by conversation_id -- a single 'unit test' + grouped = defaultdict(list) + for item in test_data: + conversation_id = item.get("conversation_id") + grouped[conversation_id].append(item) + + test_evals = [] + for conversation_id, items in grouped.items(): + first_item = items[0] + ut_instructions = first_item.get("unit test instructions", "") + category = first_item.get("dataset", path.stem) + + inputs = [] + for item in items: + for turn in item.get("input", []): + if turn.get("speaker") == "user": + inputs.append(turn.get("text")) + + targets = [] + for item in items: + for turn in item.get("targets", []): + if turn.get("speaker") == "agent": + targets.append(turn.get("text")) + + # 
figure out how to add judge prompt template here + judge_prompt = """**Input to the model** + + {input} + + **Model output to be rated** + + {prediction} + + **Ground truth text** + + {target} + + **Rating Guidelines** + The model output should adhere to the following guidelines: + {guidelines} + + **Scoring Criteria** + * Score 0: The model output violates any of the guidelines. + * Score 1: The model output is well aligned with the ground truth - if it exists, the input to the model, and adheres to all guidelines. + + **Return Your Rating** + Return your rating in the following format: + {{\"score\": your_score, \"justification\": \"your_justification\"}} + + Your rating: + """ + + metadata = { + "task_type": first_item.get( + "task_type" + ), # for displaying results maybe? + "dataset": first_item.get("dataset"), + "items": items, # keep list of original items for reference + } + + test_eval = cls( + category=category, + judge_prompt=judge_prompt, + inputs=inputs, + unit_test_instructions=ut_instructions, + targets=targets, + conversation_id=conversation_id, + metadata=metadata, + ) + test_evals.append(test_eval) + + return test_evals From 21f7960db42175a4f2542867cb73af68bf164258 Mon Sep 17 00:00:00 2001 From: "Keshav Ramji Keshav.Ramji@ibm.com" Date: Thu, 6 Nov 2025 16:33:27 +0000 Subject: [PATCH 2/3] Update v1 data format and judge call --- .gitignore | 2 + cli/eval/commands.py | 4 + cli/eval/runner.py | 78 +++++++++-------- mellea/stdlib/test_based_eval.py | 138 +++++++++++-------------------- 4 files changed, 93 insertions(+), 129 deletions(-) diff --git a/.gitignore b/.gitignore index f391d4a6..9249c799 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ kr_results/ kr_data/ xet/ +job.sh +hub/ # Python-generated files __pycache__/ diff --git a/cli/eval/commands.py b/cli/eval/commands.py index 3ca7f4cc..65d8e817 100644 --- a/cli/eval/commands.py +++ b/cli/eval/commands.py @@ -9,10 +9,12 @@ def eval_run( ), backend: str = typer.Option("ollama", "--backend", "-b", help="Generation backend"), model: str = typer.Option(None, "--model", help="Generation model name"), + max_gen_tokens: int = typer.Option(256, "--max-gen-tokens", help="Max tokens to generate for responses"), judge_backend: str = typer.Option( None, "--judge-backend", "-jb", help="Judge backend" ), judge_model: str = typer.Option(None, "--judge-model", help="Judge model name"), + max_judge_tokens: int = typer.Option(256, "--max-judge-tokens", help="Max tokens for the judge model's judgement."), output_path: str = typer.Option( "eval_results", "--output-path", "-o", help="Output path for results" ), @@ -28,8 +30,10 @@ def eval_run( test_files=test_files, backend=backend, model=model, + max_gen_tokens=max_gen_tokens, judge_backend=judge_backend, judge_model=judge_model, + max_judge_tokens=max_judge_tokens, output_path=output_path, output_format=output_format, verbose=verbose, diff --git a/cli/eval/runner.py b/cli/eval/runner.py index 7b2e7eca..2d24ae80 100644 --- a/cli/eval/runner.py +++ b/cli/eval/runner.py @@ -5,13 +5,11 @@ import mellea from mellea.stdlib.base import ModelOutputThunk -from mellea.stdlib.requirement import Requirement from mellea.stdlib.test_based_eval import TestBasedEval from mellea.backends.types import ModelOption from rich.console import Console from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn -from rich.table import Table console = Console() @@ -25,7 +23,7 @@ def __init__( model_output: str, validation_passed: bool, score: int, - validation_reason: str, + 
validation_reason: str, # add input_id ): self.input_text = input_text self.model_output = model_output @@ -52,15 +50,15 @@ def __init__(self, test_eval: TestBasedEval, input_results: list[InputEvalResult def to_dict(self): return { - "conversation_id": self.test_eval.conversation_id, - "category": self.test_eval.category, + "test_id": self.test_eval.test_id, + "source": self.test_eval.source, + "name": self.test_eval.name, + "instructions": self.test_eval.instructions, "input_results": [r.to_dict() for r in self.input_results], "expected_targets": self.test_eval.targets, - "unit_test_instructions": self.test_eval.unit_test_instructions, "passed": self.passed_count, "total_count": self.total_count, "pass_rate": self.pass_rate, - "metadata": self.test_eval.metadata, } @property @@ -76,7 +74,7 @@ def pass_rate(self) -> float: return self.passed_count / self.total_count if self.total_count > 0 else 0.0 -def create_session(backend: str, model: str | None) -> mellea.MelleaSession: +def create_session(backend: str, model: str | None, max_tokens: int | None) -> mellea.MelleaSession: """Create a mellea session with the specified backend and model.""" model_id = None @@ -98,35 +96,35 @@ def create_session(backend: str, model: str | None) -> mellea.MelleaSession: from mellea.backends.ollama import OllamaModelBackend backend_instance = OllamaModelBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} ) elif backend_lower == "openai": from mellea.backends.openai import OpenAIBackend backend_instance = OpenAIBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} ) elif backend_lower in ["hf", "huggingface"]: from mellea.backends.huggingface import LocalHFBackend backend_instance = LocalHFBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower == "watsonx": from mellea.backends.watsonx import WatsonxAIBackend backend_instance = WatsonxAIBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} ) elif backend_lower == "litellm": from mellea.backends.litellm import LiteLLMBackend backend_instance = LiteLLMBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: 256} + model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} ) else: @@ -139,7 +137,7 @@ def create_session(backend: str, model: str | None) -> mellea.MelleaSession: session = mellea.MelleaSession( backend=backend_instance, ctx=SimpleContext() - ) # need to reset to SimpleContext? 
print what is being judged by the judge (input) + ) return session except Exception as e: @@ -153,8 +151,10 @@ def run_evaluations( test_files: List[str], backend: str, model: str | None, + max_gen_tokens: int | None, judge_backend: str | None, judge_model: str | None, + max_judge_tokens: int | None, output_path: str, output_format: str, verbose: bool, @@ -176,16 +176,17 @@ def run_evaluations( return console.print(f"Total test evals to run: {len(all_test_evals)}") + total_inputs = sum(len(te.inputs) for te in all_test_evals) + console.print(f"Total inputs to run: {total_inputs}") console.print(f"Generation model: {model}") console.print(f"Judge model: {judge_model}") - m = create_session(backend=backend, model=model) - judge_session = create_session(backend=judge_backend, model=judge_model) + m = create_session(backend=backend, model=model, max_tokens=max_gen_tokens) + judge_session = create_session(backend=judge_backend, model=judge_model, max_tokens=max_judge_tokens) all_results = [] - # some visuals on progress with rich, we can take out / modify with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -203,7 +204,7 @@ def run_evaluations( ) all_results.append(result) except Exception as e: - console.print(f"Error {e} on test {test_eval.conversation_id}") + console.print(f"Error {e} on test {test_eval.test_id}") if not continue_on_error: raise @@ -229,23 +230,20 @@ def execute_test_eval( input_results = [] # for all inputs, generate responses with generator - for input_text in test_eval.inputs: + for idx, input_text in enumerate(test_eval.inputs): result: ModelOutputThunk = generation_session.act(input_text) model_output = str(result) - console.print(model_output) judge_session.ctx = judge_session.ctx.add(result) - requirement = Requirement( - description=create_judge_requirement(test_eval, input_text, model_output) - ) - validation_results = judge_session.validate(requirement) - validation_result = validation_results[0] + targets_for_input = (test_eval.targets[idx] if idx < len(test_eval.targets) else []) - judge_output = validation_result.reason or "" + # query the judge + judge_prompt = create_judge_requirement(test_eval, input_text, model_output, targets_for_input) + judge_output_thunk = judge_session.act(judge_prompt) + judge_output = str(judge_output_thunk) score, justification = parse_judge_output(judge_output) - - passed = score == 1 if score is not None else validation_result.as_bool() + passed = score == 1 if score is not None else False input_result = InputEvalResult( input_text=input_text, @@ -256,7 +254,7 @@ def execute_test_eval( ) input_results.append(input_result) - # reset both generator and judge -- might not be necessary since SimpleContext doesn't retain history + # reset both generator and judge generation_session.reset() judge_session.reset() @@ -265,24 +263,24 @@ def execute_test_eval( def create_judge_requirement( - test_eval: TestBasedEval, input_text: str, model_output: str + test_eval: TestBasedEval, input_text: str, model_output: str, targets_for_input: list[str] ): """Create judge requirement description""" - if len(test_eval.targets) == 0: # no reference - target_text = "N/A" # another way to handle this? 
- elif len(test_eval.targets) == 1: - target_text = test_eval.targets[0] - else: # enumerate the multiple targets + if len(targets_for_input) == 0: # no reference + target_text = "N/A" + elif len(targets_for_input) == 1: + target_text = targets_for_input[0] + else: # enumerate when there are multiple targets target_text = "\n".join( - [f"{i}. {target}" for i, target in enumerate(test_eval.targets, 1)] + [f"{i}. {target}" for i, target in enumerate(targets_for_input, 1)] ) judge_prompt = test_eval.judge_prompt.format( input=input_text, prediction=model_output, target=target_text, - guidelines=test_eval.unit_test_instructions, + guidelines=test_eval.instructions, ) return judge_prompt @@ -324,7 +322,7 @@ def save_results(results: List[TestEvalResult], output_path: str, output_format: f.write(json.dumps(result.to_dict()) + "\n") else: # json summary = { - "total_unit_tests": len(results), + "total_tests": len(results), "total_inputs": total_inputs, "passed_inputs": passed_inputs, "failed_inputs": total_inputs - passed_inputs, @@ -348,11 +346,11 @@ def summary_stats(results: List[TestEvalResult]): console.print(f"Total number of inputs across tests: {total_inputs}") console.print(f"Number of inputs passed across tests: {passed_inputs}") - console.print(f"Cumulative Pass Rate: {overall_pass_rate}") + console.print(f"Cumulative Pass Rate: {overall_pass_rate * 100:.1f}%") if len(results) > 1: console.print("Per-Test Breakdown:") for result in results: console.print( - f"{result.test_eval.conversation_id}:\n\t{result.passed_count}/{result.total_count} ({result.pass_rate * 100:.1f}%)\n\n" + f"{result.test_eval.name}:\n\t{result.passed_count}/{result.total_count} ({result.pass_rate * 100:.1f}%)\n\n" ) diff --git a/mellea/stdlib/test_based_eval.py b/mellea/stdlib/test_based_eval.py index fb361e13..8d740b6d 100644 --- a/mellea/stdlib/test_based_eval.py +++ b/mellea/stdlib/test_based_eval.py @@ -1,12 +1,10 @@ """LLM Evaluation with Unit Tests in Mellea.""" import json -from collections import defaultdict -from dataclasses import dataclass from pathlib import Path from typing import Any -from mellea.stdlib.base import CBlock, Component, TemplateRepresentation +from mellea.stdlib.base import Component class TestBasedEval(Component): @@ -14,85 +12,24 @@ class TestBasedEval(Component): def __init__( self, - category: str, - judge_prompt: str, + source: str, + name: str, + instructions: str, inputs: list[str], - unit_test_instructions: str, - targets: list[str] | None = None, # can be optional - conversation_id: str | None = None, # in case we change format later - metadata: dict[str, Any] | None = None, # for anything miscellaneous + targets: list[list[str]] | None = None, # can be optional + test_id: str | None = None, + input_ids: list[str] | None = None ): """Initialize TestBasedEval (for a single unit test).""" - self.category = category - self.judge_prompt = judge_prompt + self.source = source + self.name = name + self.instructions = instructions self.inputs = inputs - self.unit_test_instructions = unit_test_instructions self.targets = targets or [] - self.conversation_id = conversation_id - self.metadata = metadata or {} - - def format_for_llm(self) -> TemplateRepresentation | str: - """Format for use by judge LLM session.""" - return self.category + "\n\n" + self.judge_prompt - - # if making a new jinja template - # return TemplateRepresentation( - # obj=self, - # args={ - # "category": self.category, - # "judge_prompt": self.judge_prompt, - # "inputs": self.inputs, - # "unit_test_instructions": 
self.unit_test_instructions, - # "targets": self.targets, - # }, - # #template_order=["*"] - # ) + self.test_id = test_id + self.input_ids = input_ids or [] - @classmethod - def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: - """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'.""" - path = Path(filepath) - test_data = [] - - if path.suffix == ".jsonl": - with path.open("r") as f: - for line in f: - if line.strip(): - data = json.loads(line) - if not data.get("skip", False): - test_data.append(data) - else: # '.json' - with path.open("r") as f: - data = json.load(f) - if isinstance(data, list): - test_data = [item for item in data if not item.get("skip", False)] - - # group by conversation_id -- a single 'unit test' - grouped = defaultdict(list) - for item in test_data: - conversation_id = item.get("conversation_id") - grouped[conversation_id].append(item) - - test_evals = [] - for conversation_id, items in grouped.items(): - first_item = items[0] - ut_instructions = first_item.get("unit test instructions", "") - category = first_item.get("dataset", path.stem) - - inputs = [] - for item in items: - for turn in item.get("input", []): - if turn.get("speaker") == "user": - inputs.append(turn.get("text")) - - targets = [] - for item in items: - for turn in item.get("targets", []): - if turn.get("speaker") == "agent": - targets.append(turn.get("text")) - - # figure out how to add judge prompt template here - judge_prompt = """**Input to the model** + self.judge_prompt = """**Input to the model** {input} @@ -119,23 +56,46 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: Your rating: """ - metadata = { - "task_type": first_item.get( - "task_type" - ), # for displaying results maybe? 
- "dataset": first_item.get("dataset"), - "items": items, # keep list of original items for reference - } + @classmethod + def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: + """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'.""" + path = Path(filepath) + + with path.open('r') as f: + data = json.load(f) + + if not isinstance(data, list): + data = [data] + + test_evals = [] + for test_data in data: + examples = test_data.get("examples", []) + + inputs = [] + targets = [] + input_ids = [] + + for example in examples: + input_messages = example.get("input", []) + user_messages = [msg for msg in input_messages if msg.get("role") == "user"] + if user_messages: + inputs.append(user_messages[-1].get("content", "")) + + target_messages = example.get("targets", []) + targets_for_input = [msg.get("content", "") for msg in target_messages if msg.get("role") == "assistant"] + targets.append(targets_for_input) + + input_ids.append(example.get("input_id", "")) test_eval = cls( - category=category, - judge_prompt=judge_prompt, + source=test_data.get("source", "unknown"), + name=test_data.get("name", ""), + instructions=test_data.get("instructions", ""), inputs=inputs, - unit_test_instructions=ut_instructions, targets=targets, - conversation_id=conversation_id, - metadata=metadata, + test_id=test_data.get("id", ""), + input_ids=input_ids ) test_evals.append(test_eval) - return test_evals + return test_evals \ No newline at end of file From 7919b5bf7896af457e3c9ce0f85a22b7ea55bd2e Mon Sep 17 00:00:00 2001 From: "Keshav Ramji Keshav.Ramji@ibm.com" Date: Thu, 6 Nov 2025 17:29:20 +0000 Subject: [PATCH 3/3] Pre-commit fixes --- cli/eval/commands.py | 10 ++++--- cli/eval/runner.py | 47 ++++++++++++++++++++------------ mellea/stdlib/reqlib/md.py | 13 +++++++-- mellea/stdlib/test_based_eval.py | 18 ++++++++---- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/cli/eval/commands.py b/cli/eval/commands.py index 65d8e817..e4ddea6c 100644 --- a/cli/eval/commands.py +++ b/cli/eval/commands.py @@ -9,19 +9,22 @@ def eval_run( ), backend: str = typer.Option("ollama", "--backend", "-b", help="Generation backend"), model: str = typer.Option(None, "--model", help="Generation model name"), - max_gen_tokens: int = typer.Option(256, "--max-gen-tokens", help="Max tokens to generate for responses"), + max_gen_tokens: int = typer.Option( + 256, "--max-gen-tokens", help="Max tokens to generate for responses" + ), judge_backend: str = typer.Option( None, "--judge-backend", "-jb", help="Judge backend" ), judge_model: str = typer.Option(None, "--judge-model", help="Judge model name"), - max_judge_tokens: int = typer.Option(256, "--max-judge-tokens", help="Max tokens for the judge model's judgement."), + max_judge_tokens: int = typer.Option( + 256, "--max-judge-tokens", help="Max tokens for the judge model's judgement." 
+ ), output_path: str = typer.Option( "eval_results", "--output-path", "-o", help="Output path for results" ), output_format: str = typer.Option( "json", "--output-format", help="Either json or jsonl format for results" ), - verbose: bool = typer.Option(False, "--verbose", "-v"), continue_on_error: bool = typer.Option(True, "--continue-on-error"), ): from cli.eval.runner import run_evaluations @@ -36,7 +39,6 @@ def eval_run( max_judge_tokens=max_judge_tokens, output_path=output_path, output_format=output_format, - verbose=verbose, continue_on_error=continue_on_error, ) diff --git a/cli/eval/runner.py b/cli/eval/runner.py index 2d24ae80..e320e0d0 100644 --- a/cli/eval/runner.py +++ b/cli/eval/runner.py @@ -23,7 +23,7 @@ def __init__( model_output: str, validation_passed: bool, score: int, - validation_reason: str, # add input_id + validation_reason: str, # add input_id ): self.input_text = input_text self.model_output = model_output @@ -74,7 +74,9 @@ def pass_rate(self) -> float: return self.passed_count / self.total_count if self.total_count > 0 else 0.0 -def create_session(backend: str, model: str | None, max_tokens: int | None) -> mellea.MelleaSession: +def create_session( + backend: str, model: str | None, max_tokens: int | None +) -> mellea.MelleaSession: """Create a mellea session with the specified backend and model.""" model_id = None @@ -96,35 +98,40 @@ def create_session(backend: str, model: str | None, max_tokens: int | None) -> m from mellea.backends.ollama import OllamaModelBackend backend_instance = OllamaModelBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower == "openai": from mellea.backends.openai import OpenAIBackend backend_instance = OpenAIBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower in ["hf", "huggingface"]: from mellea.backends.huggingface import LocalHFBackend backend_instance = LocalHFBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower == "watsonx": from mellea.backends.watsonx import WatsonxAIBackend backend_instance = WatsonxAIBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) elif backend_lower == "litellm": from mellea.backends.litellm import LiteLLMBackend backend_instance = LiteLLMBackend( - model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens} + model_id=model_id, + model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}, ) else: @@ -135,9 +142,7 @@ def create_session(backend: str, model: str | None, max_tokens: int | None) -> m # create session with backend instance from mellea.stdlib.base import SimpleContext - session = mellea.MelleaSession( - backend=backend_instance, ctx=SimpleContext() - ) + session = mellea.MelleaSession(backend=backend_instance, ctx=SimpleContext()) return session except Exception as e: @@ -157,7 +162,6 @@ def run_evaluations( max_judge_tokens: int | None, output_path: str, output_format: str, - verbose: bool, continue_on_error: bool, ): """Run all 'unit test' evaluations""" @@ -176,14 +180,16 @@ def run_evaluations( return console.print(f"Total test evals to run: {len(all_test_evals)}") - total_inputs = 
sum(len(te.inputs) for te in all_test_evals) + total_inputs = sum(len(test_eval.inputs) for test_eval in all_test_evals) console.print(f"Total inputs to run: {total_inputs}") console.print(f"Generation model: {model}") console.print(f"Judge model: {judge_model}") m = create_session(backend=backend, model=model, max_tokens=max_gen_tokens) - judge_session = create_session(backend=judge_backend, model=judge_model, max_tokens=max_judge_tokens) + judge_session = create_session( + backend=judge_backend, model=judge_model, max_tokens=max_judge_tokens + ) all_results = [] @@ -234,12 +240,14 @@ def execute_test_eval( result: ModelOutputThunk = generation_session.act(input_text) model_output = str(result) - judge_session.ctx = judge_session.ctx.add(result) - - targets_for_input = (test_eval.targets[idx] if idx < len(test_eval.targets) else []) + targets_for_input = ( + test_eval.targets[idx] if idx < len(test_eval.targets) else [] + ) # query the judge - judge_prompt = create_judge_requirement(test_eval, input_text, model_output, targets_for_input) + judge_prompt = create_judge_requirement( + test_eval, input_text, model_output, targets_for_input + ) judge_output_thunk = judge_session.act(judge_prompt) judge_output = str(judge_output_thunk) score, justification = parse_judge_output(judge_output) @@ -263,7 +271,10 @@ def execute_test_eval( def create_judge_requirement( - test_eval: TestBasedEval, input_text: str, model_output: str, targets_for_input: list[str] + test_eval: TestBasedEval, + input_text: str, + model_output: str, + targets_for_input: list[str], ): """Create judge requirement description""" diff --git a/mellea/stdlib/reqlib/md.py b/mellea/stdlib/reqlib/md.py index 3cee2770..9a1836ed 100644 --- a/mellea/stdlib/reqlib/md.py +++ b/mellea/stdlib/reqlib/md.py @@ -14,10 +14,14 @@ def as_markdown_list(ctx: Context) -> list[str] | None: raw_output = ctx.last_output() assert raw_output is not None try: + assert raw_output.value is not None parsed = mistletoe.Document(raw_output.value) - for child in parsed.children: + assert parsed.children is not None + children = list(parsed.children) + for child in children: if type(child) is not mistletoe.block_token.List: return None + assert child.children is not None for item in child.children: xs.append(mistletoe.base_renderer.BaseRenderer().render(item)) return xs @@ -44,10 +48,13 @@ def _md_table(ctx: Context): raw_output = ctx.last_output() assert raw_output is not None try: + assert raw_output.value is not None parsed = mistletoe.Document(raw_output.value) - if len(parsed.children) != 1: + assert parsed.children is not None + children = list(parsed.children) + if len(children) != 1: return False - return type(parsed.children[0]) is mistletoe.block_token.Table + return type(children[0]) is mistletoe.block_token.Table except Exception: return False diff --git a/mellea/stdlib/test_based_eval.py b/mellea/stdlib/test_based_eval.py index 8d740b6d..a51ae414 100644 --- a/mellea/stdlib/test_based_eval.py +++ b/mellea/stdlib/test_based_eval.py @@ -18,7 +18,7 @@ def __init__( inputs: list[str], targets: list[list[str]] | None = None, # can be optional test_id: str | None = None, - input_ids: list[str] | None = None + input_ids: list[str] | None = None, ): """Initialize TestBasedEval (for a single unit test).""" self.source = source @@ -61,7 +61,7 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'.""" path = Path(filepath) - with 
path.open('r') as f: + with path.open("r") as f: data = json.load(f) if not isinstance(data, list): @@ -77,12 +77,18 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: for example in examples: input_messages = example.get("input", []) - user_messages = [msg for msg in input_messages if msg.get("role") == "user"] + user_messages = [ + msg for msg in input_messages if msg.get("role") == "user" + ] if user_messages: inputs.append(user_messages[-1].get("content", "")) target_messages = example.get("targets", []) - targets_for_input = [msg.get("content", "") for msg in target_messages if msg.get("role") == "assistant"] + targets_for_input = [ + msg.get("content", "") + for msg in target_messages + if msg.get("role") == "assistant" + ] targets.append(targets_for_input) input_ids.append(example.get("input_id", "")) @@ -94,8 +100,8 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]: inputs=inputs, targets=targets, test_id=test_data.get("id", ""), - input_ids=input_ids + input_ids=input_ids, ) test_evals.append(test_eval) - return test_evals \ No newline at end of file + return test_evals
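
Note for reviewers: a minimal sketch of the data layout and invocation this series
expects, inferred from TestBasedEval.from_json_file and cli/eval/commands.py above.
The file name "tests.json", the ids, and the example contents are placeholders for
illustration only, not fixtures shipped by this patch.

A hypothetical tests.json (one test with a single example input/target pair; field
names follow the reader in mellea/stdlib/test_based_eval.py):

    [
      {
        "id": "demo-001",
        "source": "demo",
        "name": "politeness_check",
        "instructions": "Responses must be polite and written in English.",
        "examples": [
          {
            "input_id": "demo-001-a",
            "input": [
              {"role": "user", "content": "Say hello to the new team members."}
            ],
            "targets": [
              {"role": "assistant", "content": "Hello everyone, welcome to the team!"}
            ]
          }
        ]
      }
    ]

A matching run through the "m" entry point defined in cli/m.py. The judge backend is
passed explicitly because --judge-backend has no default; omitting --model and
--judge-model falls back to mellea.model_ids.IBM_GRANITE_4_MICRO_3B inside
create_session:

    m eval run tests.json \
        --backend ollama \
        --judge-backend ollama \
        --max-gen-tokens 256 --max-judge-tokens 256 \
        --output-path eval_results --output-format json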