diff --git a/notebooks/code_samples/agents/banking_test_dataset.py b/notebooks/code_samples/agents/banking_test_dataset.py index bd2793169..895b1e97f 100644 --- a/notebooks/code_samples/agents/banking_test_dataset.py +++ b/notebooks/code_samples/agents/banking_test_dataset.py @@ -12,14 +12,6 @@ "session_id": str(uuid.uuid4()), "category": "credit_risk" }, - { - "input": "Evaluate credit risk for a business loan of $250,000 with monthly revenue of $85,000 and existing debt of $45,000 and credit score of 650", - "expected_tools": ["credit_risk_analyzer"], - "possible_outputs": ["MEDIUM RISK", "HIGH RISK", "business loan", "debt service coverage ratio", "1.8", "annual revenue", "$1,020,000", "risk score", "650"], - "expected_output": "MEDIUM RISK", # Example, adjust as needed - "session_id": str(uuid.uuid4()), - "category": "credit_risk" - }, { "input": "Check account balance for checking account 12345", "expected_tools": ["customer_account_manager"], @@ -45,29 +37,5 @@ "expected_output": "High-Yield Savings Account (2.5% APY)", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "account_management" - }, - { - "input": "Investigate suspicious transactions totaling $75,000 across multiple accounts in the last week", - "expected_tools": ["fraud_detection_system"], - "possible_outputs": ["Require additional verification", "Implement 24-hour delay for verification"], - "expected_output": "Require additional verification", # Example, adjust as needed - "session_id": str(uuid.uuid4()), - "category": "fraud_detection" - }, - { - "input": "Assess credit risk for a $1,000,000 commercial real estate loan with $500,000 annual business income", - "expected_tools": ["credit_risk_analyzer"], - "possible_outputs": ["HIGH RISK", "VERY HIGH RISK", "loan-to-value", "66.7%", "debt service coverage", "2.0"], - "expected_output": "HIGH RISK", # Example, adjust as needed - "session_id": str(uuid.uuid4()), - "category": "credit_risk" - }, - { - "input": "Update customer contact information and address for account holder 22334", - "expected_tools": ["customer_account_manager"], - "possible_outputs": ["not found in system", "Customer ID 22334 not found in system.", "not found"], - "expected_output": "Customer ID 22334 not found in system.", # Example, adjust as needed - "session_id": str(uuid.uuid4()), - "category": "account_management" } ]) diff --git a/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb b/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb index 9afebb2e6..46a3aa9e8 100644 --- a/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb +++ b/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb @@ -158,7 +158,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -q \"validmind[llm]\" " + "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" ] }, { @@ -457,6 +457,9 @@ " - Be professional and thorough in your analysis\n", "\n", " Choose and use tools wisely to provide the most helpful banking assistance.\n", + " Describe the response in user friendly manner with details describing the tool output. \n", + " Provide the response in at least 500 words.\n", + " Generate a concise execution plan for the banking request.\n", " \"\"\"\n", "# Initialize the main LLM for banking responses\n", "main_llm = ChatOpenAI(\n", @@ -736,7 +739,7 @@ "\n", "vm_test_dataset = vm.init_dataset(\n", " input_id=\"banking_test_dataset\",\n", - " dataset=banking_test_dataset.sample(2),\n", + " dataset=banking_test_dataset,\n", " text_column=\"input\",\n", " target_column=\"possible_outputs\",\n", ")\n", @@ -768,7 +771,7 @@ "\n", "print(\"Banking Agent Predictions Generated Successfully!\")\n", "print(f\"Predictions assigned to {len(vm_test_dataset._df)} test cases\")\n", - "vm_test_dataset._df.head()" + "vm_test_dataset._df" ] }, { @@ -840,6 +843,15 @@ "result.log()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.df.head(5)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -994,7 +1006,6 @@ "3. **Execution Layer** – Measures end-to-end performance:\n", "\n", " * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n", - " * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n", "\n", "Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." ] @@ -1108,8 +1119,7 @@ " agent_output_column = \"banking_agent_model_output\",\n", "\n", ")\n", - "vm_test_dataset._df.head()\n", - "\n" + "vm_test_dataset._df.head()" ] }, { @@ -1162,7 +1172,7 @@ " \"ylabel\": \"Score\",\n", " \"figsize\": (8, 6)\n", " }\n", - ").log()\n" + ").log()" ] }, { @@ -1383,31 +1393,6 @@ ").log()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity\n", - "\n", - "Let's ensure responses are professional and appropriate for banking contexts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Toxicity\",\n", - " inputs={\n", - " \"dataset\": vm_test_dataset,\n", - " },\n", - ").log()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1479,9 +1464,9 @@ ], "metadata": { "kernelspec": { - "display_name": "ValidMind (Poetry)", + "display_name": "validmind-1QuffXMV-py3.11", "language": "python", - "name": "validmind" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/notebooks/code_sharing/deepeval_integration_demo.ipynb b/notebooks/code_sharing/deepeval_integration_demo.ipynb index d7fd31f85..ac8d0bafc 100644 --- a/notebooks/code_sharing/deepeval_integration_demo.ipynb +++ b/notebooks/code_sharing/deepeval_integration_demo.ipynb @@ -847,7 +847,6 @@ "3. **Execution Layer** – Measures end-to-end performance:\n", "\n", " * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n", - " * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n", "\n", "Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." ] diff --git a/validmind/tests/load.py b/validmind/tests/load.py index 4dc97d11a..9a9f13c53 100644 --- a/validmind/tests/load.py +++ b/validmind/tests/load.py @@ -127,7 +127,9 @@ def _inspect_signature( return inputs, params -def _get_test_function_from_provider(test_id: str, namespace: str) -> Callable[..., Any]: +def _get_test_function_from_provider( + test_id: str, namespace: str +) -> Callable[..., Any]: """Load a test function from the appropriate provider or scorer store. Args: @@ -146,9 +148,7 @@ def _get_test_function_from_provider(test_id: str, namespace: str) -> Callable[. return custom_scorer if not test_provider_store.has_test_provider(namespace): - raise LoadTestError( - f"No test provider found for namespace: {namespace}" - ) + raise LoadTestError(f"No test provider found for namespace: {namespace}") provider = test_provider_store.get_test_provider(namespace)