README.md (6 changes: 3 additions & 3 deletions)

```diff
@@ -11,18 +11,18 @@ Agent Evaluation is a generative AI-powered framework for testing virtual agents
 
 Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation.
 
-## Key features
+## Key features
 
 - Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation.
 - Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses.
 - Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing.
 - Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments.
 
-## 📚 Documentation
+## Documentation
 
 To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/). To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md)
 
-## 👏 Contributors
+## Contributors
 
 Shout out to these awesome contributors:
 
```
src/agenteval/evaluators/canonical/evaluator.py (19 changes: 8 additions & 11 deletions)

```diff
@@ -13,9 +13,11 @@
 )
 from agenteval.test import TestResult
 
+from pathlib import Path
+
 logger = logging.getLogger(__name__)
 
-_PROMPT_TEMPLATE_ROOT = "evaluators/canonical"
+_PROMPT_TEMPLATE_ROOT = Path("evaluators/canonical")
 _SYSTEM_PROMPT_DIR = "system"
 _RUNTIME_PROMPT_DIR = "runtime"
 _PROMPT_TEMPLATE_NAMES = [
@@ -65,18 +67,13 @@ def __init__(
         """Initialize the evaluator."""
         super().__init__(**kwargs)
 
+        system_path = _PROMPT_TEMPLATE_ROOT / _SYSTEM_PROMPT_DIR
+        prompt_path = _PROMPT_TEMPLATE_ROOT / _RUNTIME_PROMPT_DIR
+
         self._prompt_template_map = {
             name: {
-                "system": jinja_env.get_template(
-                    os.path.join(
-                        _PROMPT_TEMPLATE_ROOT, _SYSTEM_PROMPT_DIR, f"{name}.jinja"
-                    )
-                ),
-                "prompt": jinja_env.get_template(
-                    os.path.join(
-                        _PROMPT_TEMPLATE_ROOT, _RUNTIME_PROMPT_DIR, f"{name}.jinja"
-                    )
-                ),
+                "system": jinja_env.get_template(Path.as_posix(system_path / f"{name}.jinja")),
+                "prompt": jinja_env.get_template(Path.as_posix(prompt_path / f"{name}.jinja")),
             }
             for name in _PROMPT_TEMPLATE_NAMES
         }
```
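The move from `os.path.join` to `pathlib` here is not just cosmetic: Jinja2 template names are always `/`-separated identifiers, not OS paths, so on Windows `os.path.join` produces back-slashed names that a loader will not match. `Path.as_posix(system_path / ...)` is the unbound spelling of `(system_path / ...).as_posix()`, which guarantees forward slashes. A minimal runnable sketch, using a `DictLoader` as a stand-in for the project's real `jinja_env` (whose loader setup is not shown in this diff) and a hypothetical template name:

```python
from pathlib import Path

from jinja2 import DictLoader, Environment

root = Path("evaluators/canonical")

# Jinja2 template names use '/' on every OS; as_posix() guarantees that,
# whereas str(root / ...) would yield back-slashes on Windows.
# "planning" is a hypothetical template name for illustration.
name = (root / "system" / "planning.jinja").as_posix()

# DictLoader stands in for the real loader so the sketch runs anywhere.
jinja_env = Environment(loader=DictLoader({name: "Hello {{ who }}!"}))

print(jinja_env.get_template(name).render(who="evaluator"))  # Hello evaluator!
```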
src/agenteval/evaluators/evaluator_factory.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -17,7 +17,7 @@
 from agenteval.test import Test
 
 _EVALUATOR_METHOD_MAP = {
-    "canoncial": CanonicalEvaluator,
+    "canonical": CanonicalEvaluator,
 }
 
 _DEFAULT_MODEL_CONFIG_MAP = {
```
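This one-character fix is load-bearing: evaluator classes are resolved by dictionary lookup, so with the misspelled key the documented `canonical` evaluator name could never resolve. A hypothetical illustration (the factory's actual lookup code lies outside this hunk, and the stub class stands in for the real import):

```python
class CanonicalEvaluator:  # stub for illustration only
    pass

# Before the fix: the key itself is misspelled.
_EVALUATOR_METHOD_MAP = {"canoncial": CanonicalEvaluator}
print("canonical" in _EVALUATOR_METHOD_MAP)  # False -> KeyError on lookup

# After the fix: the documented evaluator name resolves.
_EVALUATOR_METHOD_MAP = {"canonical": CanonicalEvaluator}
print(_EVALUATOR_METHOD_MAP["canonical"])  # <class '__main__.CanonicalEvaluator'>
```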
src/agenteval/evaluators/model_config/__init__.py (7 changes: 7 additions & 0 deletions, new file)

```diff
@@ -0,0 +1,7 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from .bedrock_model_config import ModelProvider, BedrockModelConfig
+from .preconfigured_model_configs import DEFAULT_CLAUDE_3_5_MODEL_CONFIG, DEFAULT_CLAUDE_3_MODEL_CONFIG, DEFAULT_CLAUDE_HAIKU_3_5_US_MODEL_CONFIG, DEFAULT_CLAUDE_US_3_7_MODEL_CONFIG, DEFAULT_LLAMA_3_3_70B_US_MODEL_CONFIG
+
+__all__ = ["ModelProvider", "BedrockModelConfig", "DEFAULT_CLAUDE_3_5_MODEL_CONFIG", "DEFAULT_CLAUDE_3_MODEL_CONFIG", "DEFAULT_CLAUDE_HAIKU_3_5_US_MODEL_CONFIG", "DEFAULT_CLAUDE_US_3_7_MODEL_CONFIG", "DEFAULT_LLAMA_3_3_70B_US_MODEL_CONFIG"]
```
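Re-exporting through the package `__init__` lets callers import from `agenteval.evaluators.model_config` directly instead of reaching into the submodules, and `__all__` pins the public surface for `from ... import *`. For example:

```python
# Enabled by the re-exports above; without them these names would have to be
# imported from .bedrock_model_config and .preconfigured_model_configs.
from agenteval.evaluators.model_config import (
    DEFAULT_CLAUDE_3_5_MODEL_CONFIG,
    BedrockModelConfig,
    ModelProvider,
)
```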
src/agenteval/summary.py (9 changes: 6 additions & 3 deletions)

```diff
@@ -7,6 +7,8 @@
 from agenteval.metrics import calculate_pass_rate_metric
 from agenteval.test import Test, TestResult
 
+from pathlib import Path
+
 _TEMPLATE_ROOT = "summary"
 _TEMPLATE_FILE_NAME = "agenteval_summary.md.jinja"
 
@@ -36,8 +38,9 @@ def create_markdown_summary(
     Returns:
         None
     """
-    template = jinja_env.get_template(os.path.join(_TEMPLATE_ROOT, _TEMPLATE_FILE_NAME))
-    summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0])
+    template_path = Path(_TEMPLATE_ROOT) / _TEMPLATE_FILE_NAME
+    template = jinja_env.get_template(Path.as_posix(template_path))
+    summary_path = Path(Path(work_dir) / Path(_TEMPLATE_FILE_NAME).stem).absolute()
 
     metrics = {"pass_rate": calculate_pass_rate_metric(pass_count, num_tests)}
 
@@ -49,5 +52,5 @@
 
 
 def _write_summary(path: str, summary: str):
-    with open(path, "w+") as f:
+    with open(path, "w+", encoding='utf-8') as f:
         f.write(summary)
```
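The `pathlib` rewrite preserves the old filename logic, since `Path.stem` strips only the final suffix, exactly as `os.path.splitext(...)[0]` did; the visible behavioral changes are that the summary path is now absolutized against the working directory and the file is written as UTF-8 regardless of the platform's locale default encoding. A quick sanity check of the equivalence:

```python
import os
from pathlib import Path

name = "agenteval_summary.md.jinja"

# .stem removes only the last suffix, matching os.path.splitext:
assert os.path.splitext(name)[0] == Path(name).stem == "agenteval_summary.md"

# New in this change: the path is resolved to an absolute location.
# ("work_dir" here is an illustrative value, not the function's argument.)
print((Path("work_dir") / Path(name).stem).absolute())
```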