3 changes: 3 additions & 0 deletions .gitignore
@@ -230,3 +230,6 @@ docs/validmind.json

# DeepEval
*.deepeval/

# Python cache
__pycache__/
10 changes: 9 additions & 1 deletion notebooks/code_samples/agents/banking_test_dataset.py
@@ -8,13 +8,15 @@
"input": "Analyze credit risk for a $50,000 personal loan application with $75,000 annual income, $1,200 monthly debt, and 720 credit score",
"expected_tools": ["credit_risk_analyzer"],
"possible_outputs": ["LOW RISK", "APPROVE", "risk score", "720", "probability of default", "2.5%"],
"expected_output": "LOW RISK", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "credit_risk"
},
{
"input": "Evaluate credit risk for a business loan of $250,000 with monthly revenue of $85,000 and existing debt of $45,000 and credit score of 650",
"expected_tools": ["credit_risk_analyzer"],
"possible_outputs": ["MEDIUM RISK", "HIGH RISK", "business loan", "debt service coverage ratio", "1.8", "annual revenue", "$1,020,000", "risk score", "650"],
"expected_output": "MEDIUM RISK", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "credit_risk"
},
@@ -24,41 +26,47 @@
# possible_outputs values relevant to account management
# Matches what _handle_check_balance would return for customer 12345 ("John Smith"), whose checking_balance is 2547.89 in the mock DB.
"possible_outputs": ["$2,547.89", "John Smith", "$2547.89"],
"expected_output": "$2,547.89", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "account_management"
},
{
"input": "Analyze fraud risk for a $15,000 wire transfer from customer 67890 to Nigeria",
"expected_tools": ["fraud_detection_system"],
"possible_outputs": ["REQUIRE VERIFICATION", "fraud score", "65", "geographic risk", "block transaction", "MEDIUM RISK"],
"expected_output": "REQUIRE VERIFICATION", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "fraud_detection"
},
{
"input": "Recommend banking products for customer 11111 with $150,000 in savings and 720 credit score",
"expected_tools": ["customer_account_manager"],
"possible_outputs": ["High-Yield Savings Account (2.5% APY)", "Personal Line of Credit up to $25,000"],
"expected_output": "High-Yield Savings Account (2.5% APY)", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "account_management"
},
{
"input": "Investigate suspicious transactions totaling $75,000 across multiple accounts in the last week",
"expected_tools": ["fraud_detection_system"],
"possible_outputs": ["Require additional verification", "Implement 24-hour delay for verification"],
"expected_output": "Require additional verification", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "fraud_detection"
},
{
"input": "Assess credit risk for a $1,000,000 commercial real estate loan with $500,000 annual business income",
"expected_tools": ["credit_risk_analyzer"],
"possible_outputs": ["HIGH RISK", "VERY HIGH RISK","loan-to-value", "66.7%", "debt service coverage", "2.0"],
"possible_outputs": ["HIGH RISK", "VERY HIGH RISK", "loan-to-value", "66.7%", "debt service coverage", "2.0"],
"expected_output": "HIGH RISK", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "credit_risk"
},
{
"input": "Update customer contact information and address for account holder 22334",
"expected_tools": ["customer_account_manager"],
"possible_outputs": ["not found in system", "Customer ID 22334 not found in system.", "not found"],
"expected_output": "Customer ID 22334 not found in system.", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "account_management"
}
218 changes: 179 additions & 39 deletions notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb
@@ -44,8 +44,12 @@
" - [Dataframe Display Settings](#toc7_2_1__) \n",
"- [Banking Accuracy Test](#toc8__) \n",
"- [Banking Tool Call Accuracy Test](#toc9__) \n",
"- [Scorers in ValidMind](#toc10__) \n",
"- [Task Completion scorer](#toc11__) \n",
"- [Scorers in ValidMind](#toc10__)\n",
" - [Plan Quality Metric scorer](#toc10_1) \n",
" - [Plan Adherence Metric scorer](#toc10_2) \n",
" - [Tool Correctness Metric scorer](#toc10_3) \n",
" - [Argument Correctness Metric scorer](#toc10_4) \n",
" - [Task Completion scorer](#toc10_5) \n",
"- [RAGAS Tests for an Agent Evaluation](#toc12__) \n",
" - [Faithfulness](#toc12_1__) \n",
" - [Response Relevancy](#toc12_2__) \n",
@@ -154,7 +158,7 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install -q validmind langgraph"
"%pip install -q validmind "
]
},
{
@@ -231,7 +235,7 @@
" api_key=\"...\",\n",
" api_secret=\"...\",\n",
" model=\"...\",\n",
")\n"
")"
]
},
{
@@ -455,7 +459,13 @@
" Choose and use tools wisely to provide the most helpful banking assistance.\n",
" \"\"\"\n",
"# Initialize the main LLM for banking responses\n",
"main_llm = ChatOpenAI(model=\"gpt-4o-mini\", temperature=0.3)\n",
"main_llm = ChatOpenAI(\n",
" model=\"gpt-5-mini\",\n",
" reasoning={\n",
" \"effort\": \"low\",\n",
" \"summary\": \"auto\"\n",
" }\n",
")\n",
"# Bind all banking tools to the main LLM\n",
"llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)\n",
"\n",
@@ -539,7 +549,7 @@
"outputs": [],
"source": [
"from validmind.models import Prompt\n",
"\n",
"from validmind.scorer.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list\n",
"def banking_agent_fn(input):\n",
" \"\"\"\n",
" Invoke the banking agent with the given input.\n",
@@ -578,7 +588,13 @@
" tool_calls_found.append(tool_call.name)\n",
"\n",
"\n",
" return {\"prediction\": result['messages'][-1].content, \"output\": result, \"tool_messages\": [tool_message], \"tool_calls\": tool_calls_found}\n",
" return {\n",
" \"prediction\": result['messages'][-1].content[0]['text'],\n",
" \"output\": result,\n",
" \"tool_messages\": [tool_message],\n",
" # \"tool_calls\": tool_calls_found,\n",
" \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n",
" }\n",
" except Exception as e:\n",
" # Return a fallback response if the agent fails\n",
" error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n",
@@ -720,7 +736,7 @@
"\n",
"vm_test_dataset = vm.init_dataset(\n",
" input_id=\"banking_test_dataset\",\n",
" dataset=banking_test_dataset,\n",
" dataset=banking_test_dataset.sample(2),\n",
" text_column=\"input\",\n",
" target_column=\"possible_outputs\",\n",
")\n",
@@ -755,28 +771,6 @@
"vm_test_dataset._df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='toc7_2_1__'></a>\n",
"\n",
"#### Dataframe Display Settings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pd.set_option('display.max_colwidth', 40)\n",
"# pd.set_option('display.width', 120)\n",
"# pd.set_option('display.max_colwidth', None)\n",
"# print(\"Banking Test Dataset with Predictions:\")\n",
"# vm_test_dataset._df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -977,10 +971,155 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='toc11__'></a>\n",
"<a id=\"toc6_3_4_\"></a>\n",
"\n",
"### AI Agent Evaluation Metrics\n",
"\n",
"AI agent evaluation metrics are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the **full execution trace**—including reasoning steps, tool calls, intermediate decisions, and outcomes—rather than just single input–output pairs.\n",
"\n",
"These metrics are essential because agent failures often occur in ways traditional LLM metrics miss (e.g., choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently).\n",
"\n",
"**DeepEval’s AI agent evaluation framework** breaks evaluation into three layers with corresponding metric categories:\n",
"\n",
"1. **Reasoning Layer** – Evaluates planning and strategy generation:\n",
"\n",
" * *PlanQualityMetric* – how logical, complete, and efficient the agent’s plan is\n",
" * *PlanAdherenceMetric* – whether the agent follows its own plan during execution \n",
"\n",
"2. **Action Layer** – Assesses tool usage and argument generation:\n",
"\n",
" * *ToolCorrectnessMetric* – whether the agent selects and calls the right tools\n",
" * *ArgumentCorrectnessMetric* – whether the agent generates correct tool arguments\n",
"\n",
"3. **Execution Layer** – Measures end-to-end performance:\n",
"\n",
" * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n",
" * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n",
"\n",
"Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring."
]
},
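{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before turning to the ValidMind scorer wrappers below, here is a minimal standalone sketch of how the Action Layer check works under the hood. It is only a sketch: it assumes DeepEval's documented `LLMTestCase`/`ToolCall` API and reuses the mock customer 12345 (\"John Smith\") case from the banking test dataset purely for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch (assumes DeepEval's documented LLMTestCase/ToolCall API);\n",
"# the notebook itself scores these metrics via vm_test_dataset.assign_scores below.\n",
"from deepeval.test_case import LLMTestCase, ToolCall\n",
"from deepeval.metrics import ToolCorrectnessMetric\n",
"\n",
"# Hypothetical test case built from the mock customer 12345 (\"John Smith\").\n",
"example_case = LLMTestCase(\n",
"    input=\"Check the account balance for customer 12345\",\n",
"    actual_output=\"John Smith's checking balance is $2,547.89.\",\n",
"    tools_called=[ToolCall(name=\"customer_account_manager\")],\n",
"    expected_tools=[ToolCall(name=\"customer_account_manager\")],\n",
")\n",
"\n",
"# ToolCorrectnessMetric compares tools_called against expected_tools.\n",
"tool_metric = ToolCorrectnessMetric()\n",
"tool_metric.measure(example_case)\n",
"print(tool_metric.score, tool_metric.reason)"
]
},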
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='toc10_1'></a>\n",
"\n",
"## Task Completion scorer\n",
"#### **Reasoning Layer**\n",
"#### PlanQualityMetric\n",
"This scorer measures how well the agent generates a plan before acting. A high score means the plan is logical, complete, and efficient."
]
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"vm_test_dataset.assign_scores(\n",
" metrics = \"validmind.scorer.llm.deepeval.PlanQuality\",\n",
" input_column = \"input\",\n",
" actual_output_column = \"banking_agent_model_prediction\",\n",
" tools_called_column = \"banking_agent_model_tool_called\",\n",
" agent_output_column = \"banking_agent_model_output\",\n",
")\n",
"vm_test_dataset._df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='toc10_2'></a>\n",
"\n",
"#### PlanAdherenceMetric\n",
"This scorer checks whether the agent follows the plan it created. Deviations lower the score and indicate gaps between reasoning and execution."
]
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vm_test_dataset.assign_scores(\n",
" metrics = \"validmind.scorer.llm.deepeval.PlanAdherence\",\n",
" input_column = \"input\",\n",
" actual_output_column = \"banking_agent_model_prediction\",\n",
" expected_output_column = \"expected_output\",\n",
" tools_called_column = \"banking_agent_model_tool_called\",\n",
" agent_output_column = \"banking_agent_model_output\",\n",
"\n",
")\n",
"vm_test_dataset._df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='toc10_3'></a>\n",
"\n",
"#### **Action Layer**\n",
"#### ToolCorrectnessMetric\n",
"This scorer evaluates whether the agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if the reasoning was correct."
]
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vm_test_dataset.assign_scores(\n",
" metrics = \"validmind.scorer.llm.deepeval.ToolCorrectness\",\n",
" input_column = \"input\",\n",
" actual_output_column = \"banking_agent_model_prediction\",\n",
" tools_called_column = \"banking_agent_model_tool_called\",\n",
" expected_tools_column = \"expected_tools\",\n",
" agent_output_column = \"banking_agent_model_output\",\n",
"\n",
")\n",
"vm_test_dataset._df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='toc10_4'></a>\n",
"\n",
"#### ArgumentCorrectnessMetric\n",
"This scorer assesses whether the agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results."
]
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vm_test_dataset.assign_scores(\n",
" metrics = \"validmind.scorer.llm.deepeval.ArgumentCorrectness\",\n",
" input_column = \"input\",\n",
" actual_output_column = \"banking_agent_model_prediction\",\n",
" tools_called_column = \"banking_agent_model_tool_called\",\n",
" agent_output_column = \"banking_agent_model_output\",\n",
"\n",
")\n",
"vm_test_dataset._df.head()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='toc10_5'></a>\n",
"\n",
"#### **Execution Layer**\n",
"#### TaskCompletionMetric\n",
"The TaskCompletion test evaluates whether our banking agent successfully completes the requested tasks by analyzing its outputs and tool usage. This metric assesses the agent's ability to understand user requests, execute appropriate actions, and provide complete responses that address the original query. The test provides a score from 0 to 1 along with detailed feedback on task completion quality."
]
]
},
@@ -992,12 +1131,13 @@
"source": [
"vm_test_dataset.assign_scores(\n",
" metrics = \"validmind.scorer.llm.deepeval.TaskCompletion\",\n",
" input_column=\"input\",\n",
" tools_called_column=\"tools_called\",\n",
" actual_output_column=\"banking_agent_model_prediction\",\n",
" agent_output_column=\"banking_agent_model_output\"\n",
" )\n",
"vm_test_dataset._df.head(2)"
" input_column = \"input\",\n",
" actual_output_column = \"banking_agent_model_prediction\",\n",
" agent_output_column = \"banking_agent_model_output\",\n",
" tools_called_column = \"banking_agent_model_tool_called\",\n",
"\n",
")\n",
"vm_test_dataset._df.head()"
]
},
{
Expand Down Expand Up @@ -1323,7 +1463,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "ValidMind Library",
"display_name": "ValidMind (Poetry)",
"language": "python",
"name": "validmind"
},