feature:agent and eval demo #4

Open · wants to merge 1 commit into base: main
131 changes: 131 additions & 0 deletions examples/agent/evaluation/agent_eval.py
@@ -0,0 +1,131 @@
import os
import logging
import datetime as dt
from typing import Any

import deepeval
from deepeval import evaluate
from deepeval.metrics import TaskCompletionMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, ToolCall
from langchain_openai import ChatOpenAI
from langfuse import Langfuse
from langfuse.api import TraceWithDetails


class DeepEvalOpenAI(DeepEvalBaseLLM):
    """Wraps a LangChain chat model so DeepEval can use it as the judge LLM."""

    def __init__(self, model):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        return "Custom OpenAI-compatible Model"


# Fetch recent traces from Langfuse
def fetch_traces(langfuse_cli: Any, lookback_minutes: int) -> list[TraceWithDetails]:
    # dt.timezone.utc (rather than dt.UTC) keeps this compatible with Python 3.10, the version pinned in CI
    now_timestamp = dt.datetime.now(dt.timezone.utc)
    from_timestamp = now_timestamp - dt.timedelta(minutes=lookback_minutes)
    try:
        response = langfuse_cli.fetch_traces(from_timestamp=from_timestamp, to_timestamp=now_timestamp)
        return response.data
    except Exception as e:
        logging.error(f"Failed to fetch traces: {e}")
        return []


# Build the DeepEval judge model on top of the LangChain SDK
def get_model(model_name: str) -> DeepEvalBaseLLM:
model = ChatOpenAI(
model=model_name,
temperature=0,
max_tokens=None,
timeout=None,
max_retries=2,
api_key=os.getenv("OPENAI_API_KEY"),
base_url=os.getenv("OPENAI_API_BASE"),
)
return DeepEvalOpenAI(model=model)


def handle_traces(traces: list[TraceWithDetails]) -> list[LLMTestCase]:
    """Convert Langfuse traces into DeepEval test cases."""
test_cases = []

for t in traces:
        tools_called_map = {}
        actual_output = ""
user_input = t.input["messages"]

        if isinstance(t.output, str):
            logging.error(f"Skipping trace with unstructured string output: {t}")
elif isinstance(t.output, dict) and "messages" in t.output:
for message in t.output["messages"]:
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list) and len(tool_calls) > 0:
for tool_call in tool_calls:
tools_called_map[tool_call["id"]] = ToolCall(
name=tool_call["name"],
input_parameters=tool_call["args"],
output=None,
)
if message["type"] == "tool":
tool_call_id = message.get("tool_call_id")
if tool_call_id in tools_called_map:
tools_called_map[tool_call_id].output = message["content"]
if message["type"] == "ai" and message["response_metadata"]["finish_reason"] == "stop":
actual_output = message["content"]

        tools_called_list = list(tools_called_map.values())

test_case = LLMTestCase(
input=user_input,
actual_output=actual_output,
tools_called=tools_called_list,
)
test_cases.append(test_case)

return test_cases
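
# For illustration: a minimal sketch of the trace output shape handle_traces
# expects. The field names follow LangChain's message serialization, but the
# exact payload depends on your LangChain/Langfuse versions, so verify this
# against one of your real traces before relying on it.
SAMPLE_TRACE_OUTPUT = {
    "messages": [
        {"type": "ai", "content": "", "tool_calls": [
            {"id": "call_1", "name": "add", "args": {"a": 2, "b": 3}},
        ]},
        {"type": "tool", "tool_call_id": "call_1", "content": "5"},
        {"type": "ai", "content": "2 + 3 = 5.",
         "response_metadata": {"finish_reason": "stop"}},
    ]
}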


if __name__ == "__main__":
# Get keys for your project from the project settings page
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-xxxxxx" # your langfuse public key
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-xxxxxx" # your langfuse secret key
os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx" # your langfuse host
os.environ["DEEPEVAL_RESULTS_FOLDER"] = "/Users/deepeval_result" # 本地保存评估结果路径(建议)
CONFIDENT_API_KEY = "xxxxxxxx" # confident ai 的 api key(可选)

llm = get_model(model_name="<YOUR_LLM_ID>")

metric = TaskCompletionMetric(
threshold=0.7,
model=llm,
include_reason=True
)

langfuse = Langfuse()
lookback_minutes = 30
traces = fetch_traces(langfuse_cli=langfuse, lookback_minutes=lookback_minutes)
logging.info(f"Fetched {len(traces)} traces for last {lookback_minutes} minutes.")

deepeval.login_with_confident_api_key(CONFIDENT_API_KEY)

    test_cases = handle_traces(traces=traces)
logging.info(f"Got {len(test_cases)} test cases.")

# Evaluate end-to-end
evaluate(test_cases=test_cases, metrics=[metric])
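
# Possible follow-up (a sketch, assuming deepeval's evaluate() returns an
# EvaluationResult whose test_results carry per-case success flags; verify
# against your installed deepeval version):
#
#     results = evaluate(test_cases=test_cases, metrics=[metric])
#     for r in results.test_results:
#         logging.info(f"{r.name}: success={r.success}")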
48 changes: 48 additions & 0 deletions examples/agent/evaluation/eval-actions-demo.yaml
@@ -0,0 +1,48 @@
name: LLM App Unit Testing

on:
push:
pull_request:

jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout Code
        uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"

- name: Install Poetry
run: |
          curl -sSL https://install.python-poetry.org | python3 -
echo "$HOME/.local/bin" >> $GITHUB_PATH

- name: Install Dependencies
run: poetry install --no-root

- name: Set OpenAI API Key
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> $GITHUB_ENV

- name: Set OpenAI API Base
env:
          OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
run: echo "OPENAI_API_BASE=$OPENAI_API_BASE" >> $GITHUB_ENV

- name: Set LLM
env:
          LLM_ID: ${{ secrets.LLM_ID }}
run: echo "LLM_ID=$LLM_ID" >> $GITHUB_ENV

- name: Login to Confident AI
env:
CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }}
run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY"

- name: Run DeepEval Test Run
run: poetry run deepeval test run test_llm_app.py -i
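
# Note: this workflow assumes four repository secrets are configured under
# Settings -> Secrets and variables -> Actions: OPENAI_API_KEY,
# OPENAI_API_BASE, LLM_ID, and CONFIDENT_API_KEY. The "-i" flag asks
# deepeval to ignore individual metric errors instead of failing the whole
# run (confirm with `deepeval test run --help` on your installed version).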
129 changes: 129 additions & 0 deletions examples/agent/evaluation/test_llm_app.py
@@ -0,0 +1,129 @@
import os
import logging
import datetime as dt
from typing import Any

import pytest
from deepeval import assert_test
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import TaskCompletionMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, ToolCall
from langchain_openai import ChatOpenAI
from langfuse import Langfuse
from langfuse.api import TraceWithDetails


class DeepEvalOpenAI(DeepEvalBaseLLM):
    """Wraps a LangChain chat model so DeepEval can use it as the judge LLM."""

    def __init__(self, model):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        return "Custom OpenAI-compatible Model"


# Fetch recent traces from Langfuse
def fetch_traces(langfuse_cli: Any, lookback_minutes: int) -> list[TraceWithDetails]:
    # dt.timezone.utc (rather than dt.UTC) keeps this compatible with Python 3.10, the version pinned in CI
    now_timestamp = dt.datetime.now(dt.timezone.utc)
    from_timestamp = now_timestamp - dt.timedelta(minutes=lookback_minutes)
    try:
        response = langfuse_cli.fetch_traces(from_timestamp=from_timestamp, to_timestamp=now_timestamp)
        return response.data
    except Exception as e:
        logging.error(f"Failed to fetch traces: {e}")
        return []


# Build the DeepEval judge model on top of the LangChain SDK
def get_model(model_name: str) -> DeepEvalBaseLLM:
model = ChatOpenAI(
model=model_name,
temperature=0,
max_tokens=None,
timeout=None,
max_retries=2,
api_key=os.getenv("OPENAI_API_KEY"),
base_url=os.getenv("OPENAI_API_BASE"),
)
return DeepEvalOpenAI(model=model)


# Get keys for your project from the project settings page
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-xxxxxx" # your langfuse public key
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-xxxxxx" # your langfuse secret key
os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx" # your langfuse host
os.environ["DEEPEVAL_RESULTS_FOLDER"] = "/Users/deepeval_result" # 本地保存评估结果路径

llm = get_model(model_name=os.getenv("LLM_ID"))

metric = TaskCompletionMetric(
threshold=0.7,
model=llm,
include_reason=True
)

langfuse = Langfuse()
lookback_minutes = 30
traces = fetch_traces(langfuse_cli=langfuse, lookback_minutes=lookback_minutes)
logging.info(f"Fetched {len(traces)} traces for last {lookback_minutes} minutes.")

test_cases = []

for t in traces:
    tools_called_map = {}
    actual_output = ""
user_input = t.input["messages"]

    if isinstance(t.output, str):
        logging.error(f"Skipping trace with unstructured string output: {t}")
elif isinstance(t.output, dict) and "messages" in t.output:
for message in t.output["messages"]:
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list) and len(tool_calls) > 0:
for tool_call in tool_calls:
tools_called_map[tool_call["id"]] = ToolCall(
name=tool_call["name"],
input_parameters=tool_call["args"],
output=None,
)
if message["type"] == "tool":
tool_call_id = message.get("tool_call_id")
if tool_call_id in tools_called_map:
tools_called_map[tool_call_id].output = message["content"]
if message["type"] == "ai" and message["response_metadata"]["finish_reason"] == "stop":
actual_output = message["content"]

    tools_called_list = list(tools_called_map.values())

test_case = LLMTestCase(
input=user_input,
actual_output=actual_output,
tools_called=tools_called_list,
)
test_cases.append(test_case)
dataset = EvaluationDataset(test_cases=test_cases)

logging.info(f"Got {len(test_cases)} test cases.")


# Loop through test cases
@pytest.mark.parametrize("test_case", dataset)
def test_llm_app(test_case: LLMTestCase):
assert_test(test_case, [metric])

# RUN CMD
# deepeval test run examples/agent/evaluation/test_llm_app.py -i
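
# Note: this module intentionally mirrors agent_eval.py; the difference is
# that each trace-derived test case goes through assert_test, so the GitHub
# Actions workflow above fails the build whenever TaskCompletionMetric falls
# below the 0.7 threshold.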
52 changes: 52 additions & 0 deletions examples/agent/langgraph-agent.py
@@ -0,0 +1,52 @@
import os
import asyncio

from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain_mcp_adapters.client import MultiServerMCPClient
from langfuse.callback import CallbackHandler


# react agent + mcp
async def multi_tool_demo(model: ChatOpenAI, query: str, config: dict):
async with MultiServerMCPClient({
"math": {
"command": "python",
            # Make sure to update to the full absolute path to your math_server.py file
"args": ["math_server.py"],
"transport": "stdio",
},
}) as client:
agent = create_react_agent(model, client.get_tools())
try:
response = await agent.ainvoke({"messages": query}, config=config)
print(f"\n工具调用结果(query: {query}):")
for m in response['messages']:
m.pretty_print()
except Exception as e:
print(f"工具调用出错: {e}")

if __name__ == "__main__":
# get keys for your project
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-***" # your langfuse public key
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-***" # your langfuse secret key
os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx" # your langfuse host

query = "今有雉兔同笼,上有三十五头,下有九十四足,问雉兔各几何?(请使用我给你提供的工具)"

# init model
model = ChatOpenAI(
model="<YOUR_LLM_ID>",
api_key=os.getenv("OPENAI_API_KEY"),
base_url=os.getenv("OPENAI_API_BASE"),
)

# Initialize Langfuse CallbackHandler for Langchain (tracing)
langfuse_handler = CallbackHandler()
config = {"callbacks": [langfuse_handler]}

    # invoke agent
    asyncio.run(multi_tool_demo(model=model, query=query, config=config))
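
# Sanity check for the demo riddle (hypothetical helper, not invoked by the
# agent): 35 heads and 94 feet give 23 pheasants and 12 rabbits, since
# rabbits = (94 - 2 * 35) / 2.
#
#     heads, feet = 35, 94
#     rabbits = (feet - 2 * heads) // 2  # 12
#     pheasants = heads - rabbits        # 23
#     assert pheasants + rabbits == heads and 2 * pheasants + 4 * rabbits == feet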
33 changes: 33 additions & 0 deletions examples/agent/math_server.py
@@ -0,0 +1,33 @@
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("Math")


@mcp.tool()
def add(a: int, b: int) -> int:
"""Add two numbers"""
return a + b


@mcp.tool()
def subtract(a: int, b: int) -> int:
"""Subtract b from a"""
return a - b


@mcp.tool()
def multiply(a: int, b: int) -> int:
"""Multiply two numbers"""
return a * b


@mcp.tool()
def divide(a: int, b: int) -> float:
"""Divide a by b"""
if b == 0:
raise ValueError("Division by zero is not allowed.")
return a / b


if __name__ == "__main__":
mcp.run(transport="stdio")
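
# A minimal way to exercise this server over stdio, as a sketch using the
# official `mcp` Python SDK client (API assumed from recent mcp releases;
# verify against your installed version):
#
#     import asyncio
#     from mcp import ClientSession, StdioServerParameters
#     from mcp.client.stdio import stdio_client
#
#     async def main():
#         params = StdioServerParameters(command="python", args=["math_server.py"])
#         async with stdio_client(params) as (read, write):
#             async with ClientSession(read, write) as session:
#                 await session.initialize()
#                 result = await session.call_tool("add", {"a": 2, "b": 3})
#                 print(result.content)
#
#     asyncio.run(main())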