Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 0 additions & 32 deletions notebooks/code_samples/agents/banking_test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,6 @@
"session_id": str(uuid.uuid4()),
"category": "credit_risk"
},
{
"input": "Evaluate credit risk for a business loan of $250,000 with monthly revenue of $85,000 and existing debt of $45,000 and credit score of 650",
"expected_tools": ["credit_risk_analyzer"],
"possible_outputs": ["MEDIUM RISK", "HIGH RISK", "business loan", "debt service coverage ratio", "1.8", "annual revenue", "$1,020,000", "risk score", "650"],
"expected_output": "MEDIUM RISK", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "credit_risk"
},
{
"input": "Check account balance for checking account 12345",
"expected_tools": ["customer_account_manager"],
Expand All @@ -45,29 +37,5 @@
"expected_output": "High-Yield Savings Account (2.5% APY)", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "account_management"
},
{
"input": "Investigate suspicious transactions totaling $75,000 across multiple accounts in the last week",
"expected_tools": ["fraud_detection_system"],
"possible_outputs": ["Require additional verification", "Implement 24-hour delay for verification"],
"expected_output": "Require additional verification", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "fraud_detection"
},
{
"input": "Assess credit risk for a $1,000,000 commercial real estate loan with $500,000 annual business income",
"expected_tools": ["credit_risk_analyzer"],
"possible_outputs": ["HIGH RISK", "VERY HIGH RISK", "loan-to-value", "66.7%", "debt service coverage", "2.0"],
"expected_output": "HIGH RISK", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "credit_risk"
},
{
"input": "Update customer contact information and address for account holder 22334",
"expected_tools": ["customer_account_manager"],
"possible_outputs": ["not found in system", "Customer ID 22334 not found in system.", "not found"],
"expected_output": "Customer ID 22334 not found in system.", # Example, adjust as needed
"session_id": str(uuid.uuid4()),
"category": "account_management"
}
])
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install -q \"validmind[llm]\" "
"%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\""
]
},
{
Expand Down Expand Up @@ -457,6 +457,9 @@
" - Be professional and thorough in your analysis\n",
"\n",
" Choose and use tools wisely to provide the most helpful banking assistance.\n",
" Describe the response in user friendly manner with details describing the tool output. \n",
" Provide the response in at least 500 words.\n",
" Generate a concise execution plan for the banking request.\n",
" \"\"\"\n",
"# Initialize the main LLM for banking responses\n",
"main_llm = ChatOpenAI(\n",
Expand Down Expand Up @@ -736,7 +739,7 @@
"\n",
"vm_test_dataset = vm.init_dataset(\n",
" input_id=\"banking_test_dataset\",\n",
" dataset=banking_test_dataset.sample(2),\n",
" dataset=banking_test_dataset,\n",
" text_column=\"input\",\n",
" target_column=\"possible_outputs\",\n",
")\n",
Expand Down Expand Up @@ -768,7 +771,7 @@
"\n",
"print(\"Banking Agent Predictions Generated Successfully!\")\n",
"print(f\"Predictions assigned to {len(vm_test_dataset._df)} test cases\")\n",
"vm_test_dataset._df.head()"
"vm_test_dataset._df"
]
},
{
Expand Down Expand Up @@ -840,6 +843,15 @@
"result.log()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vm_test_dataset.df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -994,7 +1006,6 @@
"3. **Execution Layer** – Measures end-to-end performance:\n",
"\n",
" * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n",
" * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n",
"\n",
"Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring."
]
Expand Down Expand Up @@ -1108,8 +1119,7 @@
" agent_output_column = \"banking_agent_model_output\",\n",
"\n",
")\n",
"vm_test_dataset._df.head()\n",
"\n"
"vm_test_dataset._df.head()"
]
},
{
Expand Down Expand Up @@ -1162,7 +1172,7 @@
" \"ylabel\": \"Score\",\n",
" \"figsize\": (8, 6)\n",
" }\n",
").log()\n"
").log()"
]
},
{
Expand Down Expand Up @@ -1383,31 +1393,6 @@
").log()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='toc13_3__'></a>\n",
"\n",
"### Toxicity\n",
"\n",
"Let's ensure responses are professional and appropriate for banking contexts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_test(\n",
" \"validmind.data_validation.nlp.Toxicity\",\n",
" inputs={\n",
" \"dataset\": vm_test_dataset,\n",
" },\n",
").log()"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -1479,9 +1464,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "ValidMind (Poetry)",
"display_name": "validmind-1QuffXMV-py3.11",
"language": "python",
"name": "validmind"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand Down
1 change: 0 additions & 1 deletion notebooks/code_sharing/deepeval_integration_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,6 @@
"3. **Execution Layer** – Measures end-to-end performance:\n",
"\n",
" * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n",
" * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n",
"\n",
"Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring."
]
Expand Down
8 changes: 4 additions & 4 deletions validmind/tests/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,9 @@ def _inspect_signature(
return inputs, params


def _get_test_function_from_provider(test_id: str, namespace: str) -> Callable[..., Any]:
def _get_test_function_from_provider(
test_id: str, namespace: str
) -> Callable[..., Any]:
"""Load a test function from the appropriate provider or scorer store.

Args:
Expand All @@ -146,9 +148,7 @@ def _get_test_function_from_provider(test_id: str, namespace: str) -> Callable[.
return custom_scorer

if not test_provider_store.has_test_provider(namespace):
raise LoadTestError(
f"No test provider found for namespace: {namespace}"
)
raise LoadTestError(f"No test provider found for namespace: {namespace}")

provider = test_provider_store.get_test_provider(namespace)

Expand Down