feature:agent and eval demo #4

Open · wants to merge 1 commit into base: main
131 changes: 131 additions & 0 deletions examples/agent/evaluation/agent_eval.py
@@ -0,0 +1,131 @@
import os
import logging
import datetime as dt
from typing import Any

import deepeval
from deepeval import evaluate
from deepeval.metrics import TaskCompletionMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, ToolCall
from langchain_openai import ChatOpenAI
from langfuse import Langfuse
from langfuse.api import TraceWithDetails


class DeepEvalOpenAI(DeepEvalBaseLLM):
    """Wraps a LangChain chat model so DeepEval can use it as the judge LLM."""

    def __init__(self, model):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        return "Custom OpenAI-compatible Model"


# Fetch recent traces from Langfuse
def fetch_traces(langfuse_cli: Any, lookback_minutes: int) -> list[TraceWithDetails]:
    # dt.timezone.utc (rather than dt.UTC) keeps this compatible with Python 3.10, the version pinned in CI
    now_timestamp = dt.datetime.now(dt.timezone.utc)
    from_timestamp = now_timestamp - dt.timedelta(minutes=lookback_minutes)
    try:
        response = langfuse_cli.fetch_traces(from_timestamp=from_timestamp, to_timestamp=now_timestamp)
        return response.data
    except Exception as e:
        logging.error(f"Failed to fetch traces: {e}")
        return []


# Build the DeepEval judge model on top of the LangChain SDK
def get_model(model_name: str) -> DeepEvalBaseLLM:
model = ChatOpenAI(
model=model_name,
temperature=0,
max_tokens=None,
timeout=None,
max_retries=2,
api_key=os.getenv("OPENAI_API_KEY"),
base_url=os.getenv("OPENAI_API_BASE"),
)
return DeepEvalOpenAI(model=model)


def handle_traces(traces: list[TraceWithDetails]) -> list[LLMTestCase]:
    """Convert Langfuse traces into DeepEval test cases."""
test_cases = []

for t in traces:
        tools_called_map = {}
        actual_output = ""
user_input = t.input["messages"]

        if isinstance(t.output, str):
            logging.error(f"Skipping trace with unstructured string output: {t}")
elif isinstance(t.output, dict) and "messages" in t.output:
for message in t.output["messages"]:
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list) and len(tool_calls) > 0:
for tool_call in tool_calls:
tools_called_map[tool_call["id"]] = ToolCall(
name=tool_call["name"],
input_parameters=tool_call["args"],
output=None,
)
if message["type"] == "tool":
tool_call_id = message.get("tool_call_id")
if tool_call_id in tools_called_map:
tools_called_map[tool_call_id].output = message["content"]
if message["type"] == "ai" and message["response_metadata"]["finish_reason"] == "stop":
actual_output = message["content"]

        tools_called_list = list(tools_called_map.values())

test_case = LLMTestCase(
input=user_input,
actual_output=actual_output,
tools_called=tools_called_list,
)
test_cases.append(test_case)

return test_cases
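
# For illustration: a minimal sketch of the trace output shape handle_traces
# expects. The field names follow LangChain's message serialization, but the
# exact payload depends on your LangChain/Langfuse versions, so verify this
# against one of your real traces before relying on it.
SAMPLE_TRACE_OUTPUT = {
    "messages": [
        {"type": "ai", "content": "", "tool_calls": [
            {"id": "call_1", "name": "add", "args": {"a": 2, "b": 3}},
        ]},
        {"type": "tool", "tool_call_id": "call_1", "content": "5"},
        {"type": "ai", "content": "2 + 3 = 5.",
         "response_metadata": {"finish_reason": "stop"}},
    ]
}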


if __name__ == "__main__":
# Get keys for your project from the project settings page
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-xxxxxx" # your langfuse public key
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-xxxxxx" # your langfuse secret key
os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx" # your langfuse host
os.environ["DEEPEVAL_RESULTS_FOLDER"] = "/Users/deepeval_result" # 本地保存评估结果路径(建议)
CONFIDENT_API_KEY = "xxxxxxxx" # confident ai 的 api key(可选)

llm = get_model(model_name="<YOUR_LLM_ID>")

metric = TaskCompletionMetric(
threshold=0.7,
model=llm,
include_reason=True
)

langfuse = Langfuse()
lookback_minutes = 30
traces = fetch_traces(langfuse_cli=langfuse, lookback_minutes=lookback_minutes)
logging.info(f"Fetched {len(traces)} traces for last {lookback_minutes} minutes.")

deepeval.login_with_confident_api_key(CONFIDENT_API_KEY)

    test_cases = handle_traces(traces=traces)
logging.info(f"Got {len(test_cases)} test cases.")

# Evaluate end-to-end
evaluate(test_cases=test_cases, metrics=[metric])
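
# Possible follow-up (a sketch, assuming deepeval's evaluate() returns an
# EvaluationResult whose test_results carry per-case success flags; verify
# against your installed deepeval version):
#
#     results = evaluate(test_cases=test_cases, metrics=[metric])
#     for r in results.test_results:
#         logging.info(f"{r.name}: success={r.success}")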
48 changes: 48 additions & 0 deletions examples/agent/evaluation/eval-actions-demo.yaml
@@ -0,0 +1,48 @@
name: LLM App Unit Testing

on:
push:
pull_request:

jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout Code
        uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"

- name: Install Poetry
run: |
          curl -sSL https://install.python-poetry.org | python3 -
echo "$HOME/.local/bin" >> $GITHUB_PATH

- name: Install Dependencies
run: poetry install --no-root

- name: Set OpenAI API Key
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> $GITHUB_ENV

- name: Set OpenAI API Base
env:
          OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
run: echo "OPENAI_API_BASE=$OPENAI_API_BASE" >> $GITHUB_ENV

- name: Set LLM
env:
          LLM_ID: ${{ secrets.LLM_ID }}
run: echo "LLM_ID=$LLM_ID" >> $GITHUB_ENV

- name: Login to Confident AI
env:
CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }}
run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY"

- name: Run DeepEval Test Run
run: poetry run deepeval test run test_llm_app.py -i
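
# Note: this workflow assumes four repository secrets are configured under
# Settings -> Secrets and variables -> Actions: OPENAI_API_KEY,
# OPENAI_API_BASE, LLM_ID, and CONFIDENT_API_KEY. The "-i" flag asks
# deepeval to ignore individual metric errors instead of failing the whole
# run (confirm with `deepeval test run --help` on your installed version).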
129 changes: 129 additions & 0 deletions examples/agent/evaluation/test_llm_app.py
@@ -0,0 +1,129 @@
import os
import logging
import datetime as dt
from typing import Any

import pytest
from deepeval import assert_test
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import TaskCompletionMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, ToolCall
from langchain_openai import ChatOpenAI
from langfuse import Langfuse
from langfuse.api import TraceWithDetails


class DeepEvalOpenAI(DeepEvalBaseLLM):
    """Wraps a LangChain chat model so DeepEval can use it as the judge LLM."""

    def __init__(self, model):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        return "Custom OpenAI-compatible Model"


# Fetch recent traces from Langfuse
def fetch_traces(langfuse_cli: Any, lookback_minutes: int) -> list[TraceWithDetails]:
    # dt.timezone.utc (rather than dt.UTC) keeps this compatible with Python 3.10, the version pinned in CI
    now_timestamp = dt.datetime.now(dt.timezone.utc)
    from_timestamp = now_timestamp - dt.timedelta(minutes=lookback_minutes)
    try:
        response = langfuse_cli.fetch_traces(from_timestamp=from_timestamp, to_timestamp=now_timestamp)
        return response.data
    except Exception as e:
        logging.error(f"Failed to fetch traces: {e}")
        return []


# Build the DeepEval judge model on top of the LangChain SDK
def get_model(model_name: str) -> DeepEvalBaseLLM:
model = ChatOpenAI(
model=model_name,
temperature=0,
max_tokens=None,
timeout=None,
max_retries=2,
api_key=os.getenv("OPENAI_API_KEY"),
base_url=os.getenv("OPENAI_API_BASE"),
)
return DeepEvalOpenAI(model=model)


# Get keys for your project from the project settings page
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-xxxxxx" # your langfuse public key
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-xxxxxx" # your langfuse secret key
os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx" # your langfuse host
os.environ["DEEPEVAL_RESULTS_FOLDER"] = "/Users/deepeval_result" # 本地保存评估结果路径

llm = get_model(model_name=os.getenv("LLM_ID"))

metric = TaskCompletionMetric(
threshold=0.7,
model=llm,
include_reason=True
)

langfuse = Langfuse()
lookback_minutes = 30
traces = fetch_traces(langfuse_cli=langfuse, lookback_minutes=lookback_minutes)
logging.info(f"Fetched {len(traces)} traces for last {lookback_minutes} minutes.")

test_cases = []

for t in traces:
    tools_called_map = {}
    actual_output = ""
user_input = t.input["messages"]

    if isinstance(t.output, str):
        logging.error(f"Skipping trace with unstructured string output: {t}")
elif isinstance(t.output, dict) and "messages" in t.output:
for message in t.output["messages"]:
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list) and len(tool_calls) > 0:
for tool_call in tool_calls:
tools_called_map[tool_call["id"]] = ToolCall(
name=tool_call["name"],
input_parameters=tool_call["args"],
output=None,
)
if message["type"] == "tool":
tool_call_id = message.get("tool_call_id")
if tool_call_id in tools_called_map:
tools_called_map[tool_call_id].output = message["content"]
if message["type"] == "ai" and message["response_metadata"]["finish_reason"] == "stop":
actual_output = message["content"]

    tools_called_list = list(tools_called_map.values())

test_case = LLMTestCase(
input=user_input,
actual_output=actual_output,
tools_called=tools_called_list,
)
test_cases.append(test_case)
dataset = EvaluationDataset(test_cases=test_cases)

logging.info(f"Got {len(test_cases)} test cases.")


# Loop through test cases
@pytest.mark.parametrize("test_case", dataset)
def test_llm_app(test_case: LLMTestCase):
assert_test(test_case, [metric])

# RUN CMD
# deepeval test run examples/agent/evaluation/test_llm_app.py -i
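
# Note: this module intentionally mirrors agent_eval.py; the difference is
# that each trace-derived test case goes through assert_test, so the GitHub
# Actions workflow above fails the build whenever TaskCompletionMetric falls
# below the 0.7 threshold.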
52 changes: 52 additions & 0 deletions examples/agent/langgraph-agent.py
@@ -0,0 +1,52 @@
import os
import asyncio

from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain_mcp_adapters.client import MultiServerMCPClient
from langfuse.callback import CallbackHandler


# react agent + mcp
async def multi_tool_demo(model: ChatOpenAI, query: str, config: dict):
async with MultiServerMCPClient({
"math": {
"command": "python",
            # Make sure to update to the full absolute path to your math_server.py file
"args": ["math_server.py"],
"transport": "stdio",
},
}) as client:
agent = create_react_agent(model, client.get_tools())
try:
response = await agent.ainvoke({"messages": query}, config=config)
print(f"\n工具调用结果(query: {query}):")
for m in response['messages']:
m.pretty_print()
except Exception as e:
print(f"工具调用出错: {e}")

if __name__ == "__main__":
# get keys for your project
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-***" # your langfuse public key
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-***" # your langfuse secret key
os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx" # your langfuse host

query = "今有雉兔同笼,上有三十五头,下有九十四足,问雉兔各几何?(请使用我给你提供的工具)"

# init model
model = ChatOpenAI(
model="<YOUR_LLM_ID>",
api_key=os.getenv("OPENAI_API_KEY"),
base_url=os.getenv("OPENAI_API_BASE"),
)

# Initialize Langfuse CallbackHandler for Langchain (tracing)
langfuse_handler = CallbackHandler()
config = {"callbacks": [langfuse_handler]}

    # invoke agent
    asyncio.run(multi_tool_demo(model=model, query=query, config=config))
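
# Sanity check for the demo riddle (hypothetical helper, not invoked by the
# agent): 35 heads and 94 feet give 23 pheasants and 12 rabbits, since
# rabbits = (94 - 2 * 35) / 2.
#
#     heads, feet = 35, 94
#     rabbits = (feet - 2 * heads) // 2  # 12
#     pheasants = heads - rabbits        # 23
#     assert pheasants + rabbits == heads and 2 * pheasants + 4 * rabbits == feet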
33 changes: 33 additions & 0 deletions examples/agent/math_server.py
@@ -0,0 +1,33 @@
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("Math")


@mcp.tool()
def add(a: int, b: int) -> int:
"""Add two numbers"""
return a + b


@mcp.tool()
def subtract(a: int, b: int) -> int:
"""Subtract b from a"""
return a - b


@mcp.tool()
def multiply(a: int, b: int) -> int:
"""Multiply two numbers"""
return a * b


@mcp.tool()
def divide(a: int, b: int) -> float:
"""Divide a by b"""
if b == 0:
raise ValueError("Division by zero is not allowed.")
return a / b


if __name__ == "__main__":
mcp.run(transport="stdio")
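
# A minimal way to exercise this server over stdio, as a sketch using the
# official `mcp` Python SDK client (API assumed from recent mcp releases;
# verify against your installed version):
#
#     import asyncio
#     from mcp import ClientSession, StdioServerParameters
#     from mcp.client.stdio import stdio_client
#
#     async def main():
#         params = StdioServerParameters(command="python", args=["math_server.py"])
#         async with stdio_client(params) as (read, write):
#             async with ClientSession(read, write) as session:
#                 await session.initialize()
#                 result = await session.call_tool("add", {"a": 2, "b": 3})
#                 print(result.content)
#
#     asyncio.run(main())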