From 82f57c85f494039c77c2341af3fe848aa478e740 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Wed, 19 Mar 2025 22:05:20 +0000 Subject: [PATCH 1/5] remove unicode symbols --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4868393..cfb93e5 100644 --- a/README.md +++ b/README.md @@ -11,18 +11,18 @@ Agent Evaluation is a generative AI-powered framework for testing virtual agents Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation. -## ✨ Key features +## Key features - Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation. - Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses. - Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing. - Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments. -## 📚 Documentation +## Documentation To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/). To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) -## 👏 Contributors +## Contributors Shout out to these awesome contributors: From a8e007e868b133f6df08057ae95f752cef30236e Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Wed, 19 Mar 2025 22:05:50 +0000 Subject: [PATCH 2/5] switch to pathlib for cross-platform --- .../evaluators/canonical/evaluator.py | 19 ++++++++----------- src/agenteval/summary.py | 9 ++++++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/agenteval/evaluators/canonical/evaluator.py b/src/agenteval/evaluators/canonical/evaluator.py index bc248e3..f1fb033 100644 --- a/src/agenteval/evaluators/canonical/evaluator.py +++ b/src/agenteval/evaluators/canonical/evaluator.py @@ -13,9 +13,11 @@ ) from agenteval.test import TestResult +from pathlib import Path + logger = logging.getLogger(__name__) -_PROMPT_TEMPLATE_ROOT = "evaluators/canonical" +_PROMPT_TEMPLATE_ROOT = Path("evaluators/canonical") _SYSTEM_PROMPT_DIR = "system" _RUNTIME_PROMPT_DIR = "runtime" _PROMPT_TEMPLATE_NAMES = [ @@ -65,18 +67,13 @@ def __init__( """Initialize the evaluator.""" super().__init__(**kwargs) + system_path = _PROMPT_TEMPLATE_ROOT / _SYSTEM_PROMPT_DIR + prompt_path = _PROMPT_TEMPLATE_ROOT / _RUNTIME_PROMPT_DIR + self._prompt_template_map = { name: { - "system": jinja_env.get_template( - os.path.join( - _PROMPT_TEMPLATE_ROOT, _SYSTEM_PROMPT_DIR, f"{name}.jinja" - ) - ), - "prompt": jinja_env.get_template( - os.path.join( - _PROMPT_TEMPLATE_ROOT, _RUNTIME_PROMPT_DIR, f"{name}.jinja" - ) - ), + "system": jinja_env.get_template(Path.as_posix(system_path / f"{name}.jinja")), + "prompt": jinja_env.get_template(Path.as_posix(prompt_path / f"{name}.jinja")), } for name in _PROMPT_TEMPLATE_NAMES } diff --git a/src/agenteval/summary.py b/src/agenteval/summary.py index a334155..2862096 100644 --- a/src/agenteval/summary.py +++ b/src/agenteval/summary.py @@ -7,6 +7,8 @@ from agenteval.metrics import calculate_pass_rate_metric from agenteval.test import Test, TestResult +from pathlib import Path + _TEMPLATE_ROOT = "summary" _TEMPLATE_FILE_NAME = "agenteval_summary.md.jinja" @@ -36,8 +38,9 @@ def create_markdown_summary( Returns: None """ - template = jinja_env.get_template(os.path.join(_TEMPLATE_ROOT, _TEMPLATE_FILE_NAME)) - summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0]) + template_path = Path(_TEMPLATE_ROOT) / _TEMPLATE_FILE_NAME + template = jinja_env.get_template(Path.as_posix(template_path)) + summary_path = Path(Path(work_dir) / Path(_TEMPLATE_FILE_NAME).stem).absolute() metrics = {"pass_rate": calculate_pass_rate_metric(pass_count, num_tests)} @@ -49,5 +52,5 @@ def create_markdown_summary( def _write_summary(path: str, summary: str): - with open(path, "w+") as f: + with open(path, "w+", encoding='utf-8') as f: f.write(summary) From 7161718c0fc659ed147d7a7844640f53f0d9c9af Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Wed, 19 Mar 2025 22:06:21 +0000 Subject: [PATCH 3/5] added to fix module import issue --- src/agenteval/evaluators/model_config/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/agenteval/evaluators/model_config/__init__.py diff --git a/src/agenteval/evaluators/model_config/__init__.py b/src/agenteval/evaluators/model_config/__init__.py new file mode 100644 index 0000000..e69de29 From b7e1b5ce8ed07a1afc5db93662ca5f2c25f096bb Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Wed, 19 Mar 2025 22:11:32 +0000 Subject: [PATCH 4/5] fix typo --- src/agenteval/evaluators/evaluator_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agenteval/evaluators/evaluator_factory.py b/src/agenteval/evaluators/evaluator_factory.py index 25791ea..6c703a4 100644 --- a/src/agenteval/evaluators/evaluator_factory.py +++ b/src/agenteval/evaluators/evaluator_factory.py @@ -17,7 +17,7 @@ from agenteval.test import Test _EVALUATOR_METHOD_MAP = { - "canoncial": CanonicalEvaluator, + "canonical": CanonicalEvaluator, } _DEFAULT_MODEL_CONFIG_MAP = { From 304694bfb3d63c16019fde5fcba49e5540edebc2 Mon Sep 17 00:00:00 2001 From: Paul Brown Date: Thu, 20 Mar 2025 09:36:29 +0000 Subject: [PATCH 5/5] added content to allow import --- src/agenteval/evaluators/model_config/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/agenteval/evaluators/model_config/__init__.py b/src/agenteval/evaluators/model_config/__init__.py index e69de29..2d05e40 100644 --- a/src/agenteval/evaluators/model_config/__init__.py +++ b/src/agenteval/evaluators/model_config/__init__.py @@ -0,0 +1,7 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +from .bedrock_model_config import ModelProvider, BedrockModelConfig +from .preconfigured_model_configs import DEFAULT_CLAUDE_3_5_MODEL_CONFIG, DEFAULT_CLAUDE_3_MODEL_CONFIG, DEFAULT_CLAUDE_HAIKU_3_5_US_MODEL_CONFIG, DEFAULT_CLAUDE_US_3_7_MODEL_CONFIG, DEFAULT_LLAMA_3_3_70B_US_MODEL_CONFIG + +__all__ = ["ModelProvider", "BedrockModelConfig", "DEFAULT_CLAUDE_3_5_MODEL_CONFIG", "DEFAULT_CLAUDE_3_MODEL_CONFIG", "DEFAULT_CLAUDE_HAIKU_3_5_US_MODEL_CONFIG", "DEFAULT_CLAUDE_US_3_7_MODEL_CONFIG", "DEFAULT_LLAMA_3_3_70B_US_MODEL_CONFIG"]