diff --git a/README.md b/README.md
index 4868393..cfb93e5 100644
--- a/README.md
+++ b/README.md
@@ -11,18 +11,18 @@ Agent Evaluation is a generative AI-powered framework for testing virtual agents
 Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation.
 
-## ✨ Key features
+## Key features
 
 - Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation.
 - Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses.
 - Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing.
 - Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments.
 
-## 📚 Documentation
+## Documentation
 
 To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/).
 
 To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md)
 
-## 👏 Contributors
+## Contributors
 
 Shout out to these awesome contributors:
diff --git a/src/agenteval/evaluators/canonical/evaluator.py b/src/agenteval/evaluators/canonical/evaluator.py
index bc248e3..f1fb033 100644
--- a/src/agenteval/evaluators/canonical/evaluator.py
+++ b/src/agenteval/evaluators/canonical/evaluator.py
@@ -13,9 +13,11 @@
 )
 from agenteval.test import TestResult
 
+from pathlib import Path
+
 logger = logging.getLogger(__name__)
 
-_PROMPT_TEMPLATE_ROOT = "evaluators/canonical"
+_PROMPT_TEMPLATE_ROOT = Path("evaluators/canonical")
 _SYSTEM_PROMPT_DIR = "system"
 _RUNTIME_PROMPT_DIR = "runtime"
 _PROMPT_TEMPLATE_NAMES = [
@@ -65,18 +67,13 @@ def __init__(
         """Initialize the evaluator."""
         super().__init__(**kwargs)
 
+        system_path = _PROMPT_TEMPLATE_ROOT / _SYSTEM_PROMPT_DIR
+        prompt_path = _PROMPT_TEMPLATE_ROOT / _RUNTIME_PROMPT_DIR
+
         self._prompt_template_map = {
             name: {
-                "system": jinja_env.get_template(
-                    os.path.join(
-                        _PROMPT_TEMPLATE_ROOT, _SYSTEM_PROMPT_DIR, f"{name}.jinja"
-                    )
-                ),
-                "prompt": jinja_env.get_template(
-                    os.path.join(
-                        _PROMPT_TEMPLATE_ROOT, _RUNTIME_PROMPT_DIR, f"{name}.jinja"
-                    )
-                ),
+                "system": jinja_env.get_template(Path.as_posix(system_path / f"{name}.jinja")),
+                "prompt": jinja_env.get_template(Path.as_posix(prompt_path / f"{name}.jinja")),
             }
             for name in _PROMPT_TEMPLATE_NAMES
         }
diff --git a/src/agenteval/evaluators/evaluator_factory.py b/src/agenteval/evaluators/evaluator_factory.py
index 25791ea..6c703a4 100644
--- a/src/agenteval/evaluators/evaluator_factory.py
+++ b/src/agenteval/evaluators/evaluator_factory.py
@@ -17,7 +17,7 @@
 from agenteval.test import Test
 
 _EVALUATOR_METHOD_MAP = {
-    "canoncial": CanonicalEvaluator,
+    "canonical": CanonicalEvaluator,
 }
 
 _DEFAULT_MODEL_CONFIG_MAP = {
diff --git a/src/agenteval/evaluators/model_config/__init__.py b/src/agenteval/evaluators/model_config/__init__.py
new file mode 100644
index 0000000..2d05e40
--- /dev/null
+++ b/src/agenteval/evaluators/model_config/__init__.py
@@ -0,0 +1,7 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from .bedrock_model_config import ModelProvider, BedrockModelConfig
+from .preconfigured_model_configs import DEFAULT_CLAUDE_3_5_MODEL_CONFIG, DEFAULT_CLAUDE_3_MODEL_CONFIG, DEFAULT_CLAUDE_HAIKU_3_5_US_MODEL_CONFIG, DEFAULT_CLAUDE_US_3_7_MODEL_CONFIG, DEFAULT_LLAMA_3_3_70B_US_MODEL_CONFIG
+
+__all__ = ["ModelProvider", "BedrockModelConfig", "DEFAULT_CLAUDE_3_5_MODEL_CONFIG", "DEFAULT_CLAUDE_3_MODEL_CONFIG", "DEFAULT_CLAUDE_HAIKU_3_5_US_MODEL_CONFIG", "DEFAULT_CLAUDE_US_3_7_MODEL_CONFIG", "DEFAULT_LLAMA_3_3_70B_US_MODEL_CONFIG"]
diff --git a/src/agenteval/summary.py b/src/agenteval/summary.py
index a334155..2862096 100644
--- a/src/agenteval/summary.py
+++ b/src/agenteval/summary.py
@@ -7,6 +7,8 @@
 from agenteval.metrics import calculate_pass_rate_metric
 from agenteval.test import Test, TestResult
 
+from pathlib import Path
+
 _TEMPLATE_ROOT = "summary"
 _TEMPLATE_FILE_NAME = "agenteval_summary.md.jinja"
 
@@ -36,8 +38,9 @@ def create_markdown_summary(
     Returns:
         None
     """
-    template = jinja_env.get_template(os.path.join(_TEMPLATE_ROOT, _TEMPLATE_FILE_NAME))
-    summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0])
+    template_path = Path(_TEMPLATE_ROOT) / _TEMPLATE_FILE_NAME
+    template = jinja_env.get_template(Path.as_posix(template_path))
+    summary_path = Path(Path(work_dir) / Path(_TEMPLATE_FILE_NAME).stem).absolute()
 
     metrics = {"pass_rate": calculate_pass_rate_metric(pass_count, num_tests)}
 
@@ -49,5 +52,5 @@
 
 
 def _write_summary(path: str, summary: str):
-    with open(path, "w+") as f:
+    with open(path, "w+", encoding='utf-8') as f:
         f.write(summary)
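For context on the `as_posix()` calls in this patch: Jinja2 looks up templates by forward-slash names on every operating system, which is presumably why the changed code builds template paths with `pathlib` and passes their POSIX form to `jinja_env.get_template()` instead of joining with `os.path`. The sketch below is a minimal illustration of that pattern, not the project's actual setup: the `DictLoader`-backed environment and the `plan.jinja` template name are hypothetical stand-ins for the package's real template directory and template names.

```python
from pathlib import Path

from jinja2 import DictLoader, Environment

# Hypothetical in-memory loader standing in for the package's template
# directory; the real project configures its own shared jinja_env.
jinja_env = Environment(
    loader=DictLoader(
        {"evaluators/canonical/system/plan.jinja": "You are evaluating {{ test_name }}."}
    )
)

_PROMPT_TEMPLATE_ROOT = Path("evaluators/canonical")
system_path = _PROMPT_TEMPLATE_ROOT / "system"

# Jinja2 template names always use forward slashes, so the lookup key must be
# the POSIX form of the path; str(path) would produce backslashes on Windows
# and the lookup would fail there.
template = jinja_env.get_template((system_path / "plan.jinja").as_posix())
print(template.render(test_name="checkout_flow"))
```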