From 6024baf9b28f7342776807c8e2215989f488cb1a Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 11:27:12 -0800 Subject: [PATCH 01/54] Creating new version of agentic AI notebook --- .../code_sharing/document_agentic_ai.ipynb | 332 ++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 notebooks/code_sharing/document_agentic_ai.ipynb diff --git a/notebooks/code_sharing/document_agentic_ai.ipynb b/notebooks/code_sharing/document_agentic_ai.ipynb new file mode 100644 index 000000000..daf171891 --- /dev/null +++ b/notebooks/code_sharing/document_agentic_ai.ipynb @@ -0,0 +1,332 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "382caa31", + "metadata": {}, + "source": [ + "# Document an agentic AI system" + ] + }, + { + "cell_type": "markdown", + "id": "about-intro-d367bebc-fcd0-46df-93ba-5f731f4b9ba9", + "metadata": {}, + "source": [ + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." + ] + }, + { + "cell_type": "markdown", + "id": "about-begin-c1d3d399-73ca-4a88-9ca8-16603eab0751", + "metadata": {}, + "source": [ + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "about-signup-65fed156-8e69-49eb-b04e-ae52c574d8e7", + "metadata": {}, + "source": [ + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
To access all features available in this notebook, you'll need a ValidMind account.\n",
    "

\n", + "Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "id": "about-concepts-f49034ef-bfb8-4643-88dc-5647eb76968a", + "metadata": {}, + "source": [ + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", + "\n", + "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. 
Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." + ] + }, + { + "cell_type": "markdown", + "id": "cell-d35096df-f3f8-4034-87fa-1973bbdd7b49-efbed180-2181-4735-83c9-7d12b69b9f4c", + "metadata": {}, + "source": [ + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "install-library-20ef5375-d71d-47b1-b669-ed90cb3f0e90", + "metadata": {}, + "source": [ + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-python-94668f4a-272a-4d93-b7c2-da735557fd37", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "id": "install-initialize-13c50d00-6167-4147-a886-541d73129898", + "metadata": {}, + "source": [ + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "id": "install-register-ab56529b-d2f2-4942-b18e-603fb7342e5b", + "metadata": {}, + "source": [ + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook.\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "id": "install-template-6e8d6279-0edf-442a-bb63-50f7f89e60b3", + "metadata": {}, + "source": [ + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Agentic AI`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "id": "install-snippet-468eb7c3-612b-4f14-96aa-dc5a3088448c", + "metadata": {}, + "source": [ + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. 
Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-init-9b9b870b-01e8-428d-afbf-8e3d9771b3f4", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", + "metadata": {}, + "source": [ + "### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-preview-template-b5149386-4ab1-41cb-9527-880573d91509", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." + ] + }, + { + "cell_type": "markdown", + "id": "next-docs-7773f25a-ef1c-40d1-9b4b-f6cecdf7dc6c", + "metadata": {}, + "source": [ + "### Work with your model documentation\n", + "\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" + ] + }, + { + "cell_type": "markdown", + "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", + "metadata": {}, + "source": [ + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." 
+ ] + }, + { + "cell_type": "markdown", + "id": "upgrade-vm-362dbd51-d699-405b-b817-f95381861976", + "metadata": {}, + "source": [ + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-version-61359c61-7218-4a90-9f6f-70794c42d997", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-restart-c45548b7-aee3-4b96-b412-37c781071787", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-99bac407-03cd-4fe3-83c8-0a280d4caa64", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 570cebf0fb2a9086ce221f147f839f57d011e029 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 11:48:54 -0800 Subject: [PATCH 02/54] Edit: Intro --- notebooks/code_sharing/document_agentic_ai.ipynb | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/notebooks/code_sharing/document_agentic_ai.ipynb b/notebooks/code_sharing/document_agentic_ai.ipynb index daf171891..4c1f3e5a3 100644 --- a/notebooks/code_sharing/document_agentic_ai.ipynb +++ b/notebooks/code_sharing/document_agentic_ai.ipynb @@ -5,7 +5,14 @@ "id": "382caa31", "metadata": {}, "source": [ - "# Document an agentic AI system" + "# Document an agentic AI system\n", + "\n", + "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, prepare your datasets and agent for testing, run out-of-the-box and custom tests and log those test results to the ValidMind Platform.\n", + "\n", + "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", + "\n", + "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", + "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool." ] }, { From 08ae74d13eea61a31378aaeee2c7c58cb72fe788 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 13:14:27 -0800 Subject: [PATCH 03/54] Save point --- .../agents/agentic_ai_template.yaml | 339 +++++++++++++++ .../agents/document_agentic_ai.ipynb | 399 ++++++++++++++++++ .../code_sharing/document_agentic_ai.ipynb | 339 --------------- 3 files changed, 738 insertions(+), 339 deletions(-) create mode 100644 notebooks/code_samples/agents/agentic_ai_template.yaml create mode 100644 notebooks/code_samples/agents/document_agentic_ai.ipynb delete mode 100644 notebooks/code_sharing/document_agentic_ai.ipynb diff --git a/notebooks/code_samples/agents/agentic_ai_template.yaml b/notebooks/code_samples/agents/agentic_ai_template.yaml new file mode 100644 index 000000000..469df8ce2 --- /dev/null +++ b/notebooks/code_samples/agents/agentic_ai_template.yaml @@ -0,0 +1,339 @@ +- id: executive_summary + title: Executive Summary + guidelines: + - Provide a high-level overview of the agentic AI system, including its + purpose, scope, and intended use cases. + - Summarize the key features that make the system agentic, such as autonomy, + reasoning, memory, adaptability, and goal-directed behavior. + - Highlight the strategic benefits for the organization, such as efficiency, + scalability, cost-effectiveness, and decision-making support. + - Outline the system’s testing and validation strategy at a glance, + emphasizing safety, reliability, and regulatory compliance. 
+ - Identify major risks, limitations, and safeguards, giving stakeholders a + concise understanding of governance and monitoring plans. + - Present the deployment vision, including expected stakeholders, + operational environments, and integration with existing workflows. + index_only: true +- id: conceptual_soundness + title: Conceptual Soundness + index_only: true + sections: + - id: model_overview + title: Model Overview + guidelines: + - Provide a concise explanation of the system’s purpose, including how + the agentic AI framework enables autonomous decision-making, + reasoning, and action-taking. + - Describe the high-level design of the agent(s), their core objectives, + and how they interact with their environment and users. + - Explain the conceptual differences between this agentic system and + traditional AI/ML models, focusing on autonomy, adaptability, and + emergent behavior. + - Highlight the role of agency, memory, feedback loops, and + goal-directedness in the system’s operation. + - Summarize the overall vision for how the system is intended to be + applied in real-world contexts, along with high-level testing goals. + parent_section: conceptual_soundness + - id: model_selection + title: Model Selection + guidelines: + - Describe the agentic AI paradigm, reasoning algorithms, or frameworks + chosen (e.g., reinforcement learning, planning, LLM-based + orchestration) and why they are suitable for the use case. + - Explain how the selected approach supports autonomy, adaptability, and + safe delegation of decision-making to the agent. + - Compare alternative paradigms (e.g., rule-based agents, purely + supervised ML models) and clarify why they were less appropriate. + - Discuss any hybrid approaches (e.g., combining symbolic reasoning with + generative models) and the rationale for customization. + - Identify potential risks and trade-offs of the chosen approach, + including known failure modes, and describe how these will be tested + and validated. + parent_section: conceptual_soundness + contents: + - content_id: model_selection + content_type: text + - id: purpose_and_scope + title: Purpose and Scope + guidelines: + - Clearly define the primary goals of the agentic AI system, including + decision-making domains and problem boundaries. + - Specify intended users, stakeholders, and environments where the agent + will operate. + - Identify the scope of autonomy granted to the agent (e.g., advisory + role, execution authority, or fully autonomous operation). + - Clarify the operational limits and scenarios where human oversight, + intervention, or escalation is required. + - Define measurable testing objectives that validate the agent’s + performance within its declared scope. + parent_section: conceptual_soundness + - id: architecture_at_glance + title: Architecture at Glance + guidelines: + - Provide a high-level diagram or description of the system + architecture, including agents, memory, reasoning modules, and + communication channels. + - Explain how the architecture supports perception, reasoning, planning, + and action loops. + - Highlight integration points with external systems, APIs, or data + sources. + - Describe the flow of information and control, showing how decisions + are formed, validated, and executed. + - Summarize testing hooks or checkpoints across components to enable + unit, integration, and system-level evaluation. 
+ parent_section: conceptual_soundness + - id: assumptions_and_limitations + title: Assumptions and Limitations + guidelines: + - List the explicit assumptions about the environment, data, and user + behavior that underpin the system’s design. + - Identify constraints in agent reasoning, knowledge scope, or autonomy + that may affect performance. + - Discuss limitations in generalizability across contexts, domains, or + environments. + - Describe how uncertainty, incomplete information, or conflicting + objectives are handled. + - Explain how assumptions and limitations are validated through stress + tests, adversarial scenarios, and edge-case evaluations. + parent_section: conceptual_soundness + - id: regulatory_requirements + title: Regulatory Requirements + guidelines: + - Identify relevant laws, regulations, and standards applicable to + autonomous decision-making systems in the financial or operational + domain. + - Explain how the system addresses compliance needs such as + auditability, explainability, fairness, and accountability. + - Clarify how human oversight and control are integrated to meet + regulatory expectations for autonomous AI. + - Highlight any specific documentation, logging, or reporting features + built into the system for compliance purposes. + - Describe testing procedures to validate regulatory compliance, + including audit trail verification and explainability checks. + parent_section: conceptual_soundness +- id: data_preparation + title: Data Evaluation + index_only: true + sections: + - id: data_description + title: Data Description + guidelines: + - Provide an overview of data sources used by the agent(s), including + structured, unstructured, streaming, or interaction-derived data. + - Describe how contextual, environmental, or feedback data is + incorporated into the agent’s reasoning processes. + - Explain how memory structures (short-term, long-term, episodic) depend + on or interact with data inputs. + - Detail preprocessing or feature engineering tailored to enable + reasoning, planning, or adaptation. + - Include validation procedures to confirm data relevance, + representativeness, and adequacy for agent training and testing. + parent_section: data_preparation + - id: data_quality + title: Data Quality + guidelines: + - Define quality requirements for agent inputs, including accuracy, + timeliness, and consistency of real-world data streams. + - Describe methods for detecting and handling incomplete, noisy, or + adversarial data. + - Explain quality control for interaction data (e.g., user prompts, + feedback) that may shape agent behavior. + - Highlight processes for maintaining integrity of memory stores and + preventing drift due to poor input quality. + - Include testing protocols for validating data pipelines, + stress-testing with edge cases, and detecting bias leakage. + parent_section: data_preparation + contents: [] +- id: model_evaluation + title: Model Evaluation + index_only: true + sections: + - id: model_description + title: Model Description + guidelines: + - Provide a clear description of the agent’s architecture, reasoning + cycle, and interaction model. + - Explain the roles of planning, memory, and feedback in enabling + autonomy and adaptability. + - Detail how subcomponents (e.g., LLMs, planners, evaluators) integrate + to achieve end-to-end functionality. + - Clarify how emergent behaviors are monitored and managed. + - Specify test coverage for each component, including unit tests, + integration tests, and system-level tests. 
+ parent_section: model_evaluation + - id: evaluation_methodology + title: Evaluation Methodology + guidelines: + - Describe the evaluation framework for testing autonomy, adaptability, + and goal alignment. + - Specify metrics for reasoning quality, task success, efficiency, and + safety. + - Explain simulation, sandboxing, or staged deployment approaches used + for testing. + - Include stress-testing for unexpected inputs, adversarial prompts, or + dynamic environments. + - Define reproducibility and benchmarking protocols to validate results + consistently across test cycles. + parent_section: model_evaluation + - id: prompt_evaluation + title: Prompt Evaluation + guidelines: + - Describe how the system’s responses to prompts are evaluated for + relevance, accuracy, and safety. + - Explain methods for detecting prompt injection, manipulation, or + adversarial use. + - Detail how evaluation ensures robustness against ambiguous, + conflicting, or incomplete instructions. + - Clarify criteria for determining when escalation to human oversight is + required. + - Define testing strategies for prompt templates, prompt chaining, and + stress scenarios. + contents: + - content_type: test + content_id: validmind.prompt_validation.Clarity + - content_type: test + content_id: validmind.prompt_validation.Conciseness + - content_type: test + content_id: validmind.prompt_validation.Delimitation + - content_type: test + content_id: validmind.prompt_validation.NegativeInstruction + - content_type: test + content_id: validmind.prompt_validation.Specificity + parent_section: model_evaluation + - id: agent_evaluation + title: Agent Evaluation + guidelines: + - Provide methods for assessing the agent’s ability to reason, plan, and + act autonomously. + - Define success metrics such as goal completion rate, adaptability to + change, and alignment with human intent. + - Explain how unintended or emergent behaviors are identified and + evaluated. + - Include testing for multi-agent interactions, collaboration, or + conflict resolution. + - Describe adversarial and edge-case testing to validate resilience of + autonomous decision-making. + contents: + - content_type: test + content_id: my_custom_tests.banking_accuracy_test + - content_type: test + content_id: my_custom_tests.BankingToolCallAccuracy + parent_section: model_evaluation + - id: output_quality + title: Output Quality + guidelines: + - Define quality standards for agent outputs (e.g., recommendations, + actions, reports). + - Evaluate outputs for consistency, accuracy, and contextual + appropriateness. + - Assess outputs for fairness, non-discrimination, and alignment with + ethical principles. + - Include processes for handling uncertainty or probabilistic reasoning + in outputs. + - Develop automated test suites to benchmark output quality against gold + standards or domain experts. + contents: + - content_type: test + content_id: validmind.model_validation.ragas.Faithfulness + - content_type: test + content_id: validmind.model_validation.ragas.ResponseRelevancy + - content_type: test + content_id: validmind.model_validation.ragas.ContextRecall + parent_section: model_evaluation + - id: Safety + title: Safety + guidelines: + - Describe built-in safety mechanisms to prevent harmful or unintended + actions by the agent. + - Explain escalation protocols for high-risk decisions requiring human + oversight. + - Detail adversarial robustness testing and red-teaming efforts to + uncover vulnerabilities. 
+ - Clarify methods for ensuring alignment with ethical, legal, and + organizational safety standards. + - Include continuous validation tests for safety boundaries under + evolving data and environment conditions. + contents: + - content_type: test + content_id: validmind.model_validation.ragas.AspectCritic + - content_type: test + content_id: validmind.prompt_validation.Bias + - content_type: test + content_id: validmind.data_validation.nlp.Toxicity + parent_section: model_evaluation + - id: reliability_resilience_and_degraded_modes + title: Reliability, Resilience and Degraded Modes + guidelines: + - Explain strategies to ensure continuity of service during system or + environment disruptions. + - Describe fallback behaviors, degraded modes, or safe defaults when + full autonomy is not possible. + - Detail resilience mechanisms for handling network, data, or + computational failures. + - Provide monitoring methods for detecting and recovering from system + instability or drift. + - Define test scenarios simulating degraded conditions to validate + graceful failure and recovery. + parent_section: model_evaluation + - id: c46a7162-5fcd-4d2f-87e2-084afae70ee9 + title: Actor specific Results + parent_section: model_evaluation + contents: [] + sections: + - id: e78c8564-5af1-4ecc-b200-f131a629a01c + title: Credit Risk Analyzer + parent_section: c46a7162-5fcd-4d2f-87e2-084afae70ee9 + contents: [] + - id: df36a0c3-be44-4e16-a59a-cb635eac3ff3 + title: Customer Account Manager + parent_section: c46a7162-5fcd-4d2f-87e2-084afae70ee9 + contents: [] + - id: 67d25cc5-2569-4727-aae1-6c5b2f84e238 + title: Fraud Detection System + parent_section: c46a7162-5fcd-4d2f-87e2-084afae70ee9 + contents: [] + - id: cost_and_performance_management + title: Cost and Performance Management + guidelines: + - Provide metrics for computational efficiency, resource utilization, + and scalability of the system. + - Explain trade-offs between autonomy, performance, and resource + consumption. + - Detail monitoring of infrastructure costs, particularly in multi-agent + or large-scale deployments. + - Describe optimization strategies for balancing responsiveness with + efficiency. + - Include load testing, latency measurement, and profiling to validate + scalability and cost-effectiveness. + parent_section: model_evaluation +- id: observability_and_monitoring + title: Observability and Monitoring + index_only: true + sections: + - id: monitoring_plan + title: Monitoring Plan + guidelines: + - Describe monitoring practices for reasoning quality, autonomy + boundaries, and safety compliance. + - Define triggers or alerts for deviations in agent behavior, output + quality, or ethical alignment. + - Explain feedback mechanisms for continuous improvement, retraining, or + realignment. + - Detail governance processes overseeing the monitoring, including human + review cycles. + - Specify testing protocols for validating monitoring tools, anomaly + detection, and alert reliability. + parent_section: observability_and_monitoring + - id: remediation_plan + title: Remediation Plan + guidelines: + - Provide steps for addressing performance degradation, misalignment, or + unsafe behaviors. + - Define escalation protocols and roles for intervention when agent + behavior breaches acceptable limits. + - Describe rollback strategies to revert to prior safe versions or modes. + - Explain retraining or recalibration processes when monitoring + identifies issues. 
+ - Include regular scenario-based testing to validate the effectiveness + of remediation and recovery procedures. + parent_section: observability_and_monitoring diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb new file mode 100644 index 000000000..7883d3701 --- /dev/null +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -0,0 +1,399 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "382caa31", + "metadata": {}, + "source": [ + "# Document an agentic AI system\n", + "\n", + "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, prepare your datasets and agent for testing, run out-of-the-box and custom tests and log those test results to the ValidMind Platform.\n", + "\n", + "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", + "\n", + "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", + "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool." + ] + }, + { + "cell_type": "markdown", + "id": "about-intro-d367bebc-fcd0-46df-93ba-5f731f4b9ba9", + "metadata": {}, + "source": [ + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." + ] + }, + { + "cell_type": "markdown", + "id": "about-begin-c1d3d399-73ca-4a88-9ca8-16603eab0751", + "metadata": {}, + "source": [ + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "about-signup-65fed156-8e69-49eb-b04e-ae52c574d8e7", + "metadata": {}, + "source": [ + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
To access all features available in this notebook, you'll need a ValidMind account.\n",
    "

\n", + "Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "id": "about-concepts-f49034ef-bfb8-4643-88dc-5647eb76968a", + "metadata": {}, + "source": [ + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", + "\n", + "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. 
Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." + ] + }, + { + "cell_type": "markdown", + "id": "cell-d35096df-f3f8-4034-87fa-1973bbdd7b49-efbed180-2181-4735-83c9-7d12b69b9f4c", + "metadata": {}, + "source": [ + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "install-library-20ef5375-d71d-47b1-b669-ed90cb3f0e90", + "metadata": {}, + "source": [ + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "Let's begin by installing the ValidMind Library with LLM support:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-python-94668f4a-272a-4d93-b7c2-da735557fd37", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" + ] + }, + { + "cell_type": "markdown", + "id": "install-initialize-13c50d00-6167-4147-a886-541d73129898", + "metadata": {}, + "source": [ + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "id": "install-register-ab56529b-d2f2-4942-b18e-603fb7342e5b", + "metadata": {}, + "source": [ + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook.\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "id": "install-template-6e8d6279-0edf-442a-bb63-50f7f89e60b3", + "metadata": {}, + "source": [ + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Agentic AI`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "id": "e4a16ffa", + "metadata": {}, + "source": [ + "
Don't see this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "install-snippet-468eb7c3-612b-4f14-96aa-dc5a3088448c", + "metadata": {}, + "source": [ + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-init-9b9b870b-01e8-428d-afbf-8e3d9771b3f4", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", + "metadata": {}, + "source": [ + "### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-preview-template-b5149386-4ab1-41cb-9527-880573d91509", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "679111bb", + "metadata": {}, + "source": [ + "### Initialize the Python environment\n", + "\n", + "Next, let's import all the necessary libraries for building our banking LangGraph agentic system:\n", + "\n", + "- **Standard libraries** for data handling and environment management.\n", + "- **pandas**, a Python library for data manipulation and analytics, as an alias. We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", + "- **LangChain components** for LLM integration and tool management.\n", + "- **LangGraph** for building stateful, multi-step agent workflows.\n", + "- **Banking tools** for specialized financial services.\n", + "\n", + "The setup includes loading environment variables (like OpenAI API keys) needed for the LLM components to function properly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a64a021", + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library imports\n", + "from typing import TypedDict, Annotated, Sequence\n", + "\n", + "# Third party imports\n", + "import pandas as pd\n", + "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", + "from langchain_openai import ChatOpenAI\n", + "from langgraph.checkpoint.memory import MemorySaver\n", + "from langgraph.graph import StateGraph, END, START\n", + "from langgraph.graph.message import add_messages\n", + "from langgraph.prebuilt import ToolNode\n", + "\n", + "# Local imports\n", + "from banking_tools import AVAILABLE_TOOLS\n", + "\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)\n", + "pd.set_option('display.max_rows', None)" + ] + }, + { + "cell_type": "markdown", + "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." + ] + }, + { + "cell_type": "markdown", + "id": "next-docs-7773f25a-ef1c-40d1-9b4b-f6cecdf7dc6c", + "metadata": {}, + "source": [ + "### Work with your model documentation\n", + "\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" + ] + }, + { + "cell_type": "markdown", + "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", + "metadata": {}, + "source": [ + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-vm-362dbd51-d699-405b-b817-f95381861976", + "metadata": {}, + "source": [ + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-version-61359c61-7218-4a90-9f6f-70794c42d997", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-restart-c45548b7-aee3-4b96-b412-37c781071787", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-99bac407-03cd-4fe3-83c8-0a280d4caa64", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/code_sharing/document_agentic_ai.ipynb b/notebooks/code_sharing/document_agentic_ai.ipynb deleted file mode 100644 index 4c1f3e5a3..000000000 --- a/notebooks/code_sharing/document_agentic_ai.ipynb +++ /dev/null @@ -1,339 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "382caa31", - "metadata": {}, - "source": [ - "# Document an agentic AI system\n", - "\n", - "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, prepare your datasets and agent for testing, run out-of-the-box and custom tests and log those test results to the ValidMind Platform.\n", - "\n", - "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", - "\n", - "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", - "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool." - ] - }, - { - "cell_type": "markdown", - "id": "about-intro-d367bebc-fcd0-46df-93ba-5f731f4b9ba9", - "metadata": {}, - "source": [ - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." - ] - }, - { - "cell_type": "markdown", - "id": "about-begin-c1d3d399-73ca-4a88-9ca8-16603eab0751", - "metadata": {}, - "source": [ - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." - ] - }, - { - "cell_type": "markdown", - "id": "about-signup-65fed156-8e69-49eb-b04e-ae52c574d8e7", - "metadata": {}, - "source": [ - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
" - ] - }, - { - "cell_type": "markdown", - "id": "about-concepts-f49034ef-bfb8-4643-88dc-5647eb76968a", - "metadata": {}, - "source": [ - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", - "\n", - "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. 
Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." - ] - }, - { - "cell_type": "markdown", - "id": "cell-d35096df-f3f8-4034-87fa-1973bbdd7b49-efbed180-2181-4735-83c9-7d12b69b9f4c", - "metadata": {}, - "source": [ - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "id": "install-library-20ef5375-d71d-47b1-b669-ed90cb3f0e90", - "metadata": {}, - "source": [ - "### Install the ValidMind Library\n", - "\n", - "
Recommended Python versions\n", - "

\n", - "Python 3.8 <= x <= 3.11
\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-python-94668f4a-272a-4d93-b7c2-da735557fd37", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "id": "install-initialize-13c50d00-6167-4147-a886-541d73129898", - "metadata": {}, - "source": [ - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "id": "install-register-ab56529b-d2f2-4942-b18e-603fb7342e5b", - "metadata": {}, - "source": [ - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook.\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "id": "install-template-6e8d6279-0edf-442a-bb63-50f7f89e60b3", - "metadata": {}, - "source": [ - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Agentic AI`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "id": "install-snippet-468eb7c3-612b-4f14-96aa-dc5a3088448c", - "metadata": {}, - "source": [ - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. 
Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-init-9b9b870b-01e8-428d-afbf-8e3d9771b3f4", - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", - "metadata": {}, - "source": [ - "### Preview the documentation template\n", - "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", - "\n", - "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-preview-template-b5149386-4ab1-41cb-9527-880573d91509", - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", - "metadata": {}, - "source": [ - "## Next steps\n", - "\n", - "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." - ] - }, - { - "cell_type": "markdown", - "id": "next-docs-7773f25a-ef1c-40d1-9b4b-f6cecdf7dc6c", - "metadata": {}, - "source": [ - "### Work with your model documentation\n", - "\n", - "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", - "\n", - "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", - "\n", - "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" - ] - }, - { - "cell_type": "markdown", - "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", - "metadata": {}, - "source": [ - "### Discover more learning resources\n", - "\n", - "We offer many interactive notebooks to help you document models:\n", - "\n", - "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", - "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", - "\n", - "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." 
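As a minimal sketch of the credential flow described above, you could also read the same values from environment variables and pass them to `vm.init()` explicitly instead of pasting the snippet inline. The environment variable names below are illustrative placeholders, not names required by the library.

```python
# Minimal sketch: initialize the ValidMind Library from environment variables.
# The variable names below are placeholders; use whichever keys you store in
# your own .env file.
import os

import validmind as vm

vm.init(
    api_host=os.getenv("VM_API_HOST"),
    api_key=os.getenv("VM_API_KEY"),
    api_secret=os.getenv("VM_API_SECRET"),
    model=os.getenv("VM_API_MODEL"),
)
```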
- ] - }, - { - "cell_type": "markdown", - "id": "upgrade-vm-362dbd51-d699-405b-b817-f95381861976", - "metadata": {}, - "source": [ - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
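As an optional sketch (not part of the original notebook), you can also read the installed version from Python's package metadata before deciding whether to upgrade:

```python
# Optional check: look up the installed ValidMind version from package metadata.
from importlib.metadata import PackageNotFoundError, version

try:
    print(f"Installed validmind version: {version('validmind')}")
except PackageNotFoundError:
    print("validmind is not installed in this environment.")
```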
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "id": "upgrade-version-61359c61-7218-4a90-9f6f-70794c42d997", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "upgrade-restart-c45548b7-aee3-4b96-b412-37c781071787", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-99bac407-03cd-4fe3-83c8-0a280d4caa64", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 49ebc39c1a527b639511920cc6ce40921e8375d1 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 13:24:51 -0800 Subject: [PATCH 04/54] Save point --- .../agents/document_agentic_ai.ipynb | 37 ++++++++++++++++--- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 7883d3701..27a8a6a84 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -218,9 +218,9 @@ "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", "metadata": {}, "source": [ - "### Preview the documentation template\n", + "#### Preview the documentation template\n", "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "Now, let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", "\n", "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" ] @@ -235,6 +235,33 @@ "vm.preview_template()" ] }, + { + "cell_type": "markdown", + "id": "ba45feba", + "metadata": {}, + "source": [ + "### Verify OpenAI API access\n", + "\n", + "For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", + "\n", + "Before continuing, make sure that a valid `OPENAI_API_KEY` is set in your `.env` file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9684fde1", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables if using .env file\n", + "try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv()\n", + "except ImportError:\n", + " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" + ] + }, { "cell_type": "markdown", "id": "679111bb", @@ -246,11 +273,9 @@ "\n", "- **Standard libraries** for data handling and environment management.\n", "- **pandas**, a Python library for data manipulation and analytics, as an alias. We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", - "- **LangChain components** for LLM integration and tool management.\n", + "- **LangChain** components for LLM integration and tool management.\n", "- **LangGraph** for building stateful, multi-step agent workflows.\n", - "- **Banking tools** for specialized financial services.\n", - "\n", - "The setup includes loading environment variables (like OpenAI API keys) needed for the LLM components to function properly." + "- **Banking tools** for specialized financial services." 
] }, { From fdaec52c322a931ced79ed3c217cb076747202b7 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 13:39:35 -0800 Subject: [PATCH 05/54] Save point --- .../agents/document_agentic_ai.ipynb | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 27a8a6a84..8115084d1 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -167,7 +167,7 @@ "id": "e4a16ffa", "metadata": {}, "source": [ - "
Don't see this template?\n", + "
Can't select this template?\n", "

\n", "Your organization administrators may need to add it to your template library:\n", "
\n", "\n", - "Let's begin by installing the ValidMind Library with LLM support:" + "Let's begin by installing the ValidMind Library with large language model (LLM) support:" ] }, { @@ -452,23 +452,32 @@ "source": [ "### Create LangGraph banking agent\n", "\n", - "Then, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a request:" + "Then, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." + ] + }, + { + "cell_type": "markdown", + "id": "6a5beb28", + "metadata": {}, + "source": [ + "#### Define the system prompt\n", + "\n", + "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" ] }, { "cell_type": "code", "execution_count": null, - "id": "6a5beb28", + "id": "64f46a1c", "metadata": {}, "outputs": [], "source": [ - "\n", "# Enhanced banking system prompt with tool selection guidance\n", "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", - " \n", + "\n", " AVAILABLE BANKING TOOLS:\n", - " \n", + "\n", " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", @@ -496,7 +505,26 @@ " - Be professional and thorough in your analysis\n", "\n", " Choose and use tools wisely to provide the most helpful banking assistance.\n", - " \"\"\"\n", + " \"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "3f8da88d", + "metadata": {}, + "source": [ + "#### Initialize the LLM\n", + "\n", + "Let's initialize the LLM that will power our banking agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b828d70", + "metadata": {}, + "outputs": [], + "source": [ "# Initialize the main LLM for banking responses\n", "main_llm = ChatOpenAI(\n", " model=\"gpt-5-mini\",\n", @@ -504,17 +532,82 @@ " \"effort\": \"low\",\n", " \"summary\": \"auto\"\n", " }\n", - ")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "866b59cb", + "metadata": {}, + "source": [ + "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate based on request input and the system prompt we defined above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65f85d86", + "metadata": {}, + "outputs": [], + "source": [ "# Bind all banking tools to the main LLM\n", - "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)\n", + "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)" + ] + }, + { + "cell_type": "markdown", + "id": "5f898062", + "metadata": {}, + "source": [ + "#### Define the agent state structure\n", + "\n", + "The agent state defines the data structure that flows through the LangGraph workflow. 
It includes:\n", + "\n", + "- **messages** — The conversation history between the user and agent\n", + "- **user_input** — The current user request\n", + "- **session_id** — A unique identifier for the conversation session\n", + "- **context** — Additional context that can be passed between nodes\n", "\n", + "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7feeebb", + "metadata": {}, + "outputs": [], + "source": [ "# Banking Agent State Definition\n", "class BankingAgentState(TypedDict):\n", " messages: Annotated[Sequence[BaseMessage], add_messages]\n", " user_input: str\n", " session_id: str\n", - " context: dict\n", + " context: dict" + ] + }, + { + "cell_type": "markdown", + "id": "31b261b2", + "metadata": {}, + "source": [ + "#### Create the agent workflow function\n", + "\n", + "We'll build the LangGraph agent workflow with two main components:\n", "\n", + "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", + "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", + "\n", + "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response. The agent will also use memory to maintain conversation context across multiple interactions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142c20b9", + "metadata": {}, + "outputs": [], + "source": [ "def create_banking_langgraph_agent():\n", " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", @@ -556,8 +649,28 @@ " memory = MemorySaver()\n", " # Compile the graph\n", " agent = workflow.compile(checkpointer=memory)\n", - " return agent\n", + " return agent" + ] + }, + { + "cell_type": "markdown", + "id": "0f19d4b7", + "metadata": {}, + "source": [ + "#### Instantiate the banking agent\n", + "\n", + "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", "\n", + "This compiled agent is ready to process banking requests and will automatically select and use the appropriate tools based on user queries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aabb7842", + "metadata": {}, + "outputs": [], + "source": [ "# Create the banking intelligent agent\n", "banking_agent = create_banking_langgraph_agent()\n", "\n", From 319df4075e9aca7ba3a775b92a8d9a742b7d41e4 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 14:22:35 -0800 Subject: [PATCH 10/54] Save point --- .../agents/document_agentic_ai.ipynb | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index e167d4b5b..ce0a2403c 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -398,7 +398,16 @@ "except Exception as e:\n", " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", "\n", - "print(\"\" + \"=\" * 60)\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6b227db", + "metadata": {}, + 
"outputs": [], + "source": [ "\n", "# Test 2: Customer Account Manager\n", "print(\"TEST 2: Customer Account Manager\")\n", @@ -423,7 +432,16 @@ "except Exception as e:\n", " print(f\"Customer Account Manager test FAILED: {e}\")\n", "\n", - "print(\"\" + \"=\" * 60)\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8442bf81", + "metadata": {}, + "outputs": [], + "source": [ "\n", "# Test 3: Fraud Detection System\n", "print(\"TEST 3: Fraud Detection System\")\n", From 8fb6df0b03716195a2a67e4a0fc656ace6d26527 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 14:26:28 -0800 Subject: [PATCH 11/54] Save point --- notebooks/code_samples/agents/document_agentic_ai.ipynb | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index ce0a2403c..2992e66b0 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -378,9 +378,6 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"Testing Individual Banking Tools\")\n", - "print(\"=\" * 60)\n", - "\n", "# Test 1: Credit Risk Analyzer\n", "print(\"TEST 1: Credit Risk Analyzer\")\n", "print(\"-\" * 40)\n", @@ -470,7 +467,7 @@ "source": [ "### Create LangGraph banking agent\n", "\n", - "Then, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." + "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." ] }, { @@ -586,7 +583,7 @@ "- **session_id** — A unique identifier for the conversation session\n", "- **context** — Additional context that can be passed between nodes\n", "\n", - "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory." + "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory:" ] }, { From ae9c3b94f3662705f39d50d969badb05972315d4 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 14:29:03 -0800 Subject: [PATCH 12/54] Save point --- .../agents/document_agentic_ai.ipynb | 110 ++++++++++++++++-- 1 file changed, 103 insertions(+), 7 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 2992e66b0..16b33b25f 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -703,18 +703,56 @@ "id": "cfd302bb", "metadata": {}, "source": [ - "### Integrate agent with ValidMind" + "### Integrate agent with ValidMind\n", + "\n", + "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation. This integration allows ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e2540236", + "metadata": {}, + "source": [ + "#### Import ValidMind integration components\n", + "\n", + "We'll import the necessary ValidMind components to integrate our agent:\n", + "- `Prompt` - Used to store the system prompt template for documentation\n", + "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` - Utilities to extract and format tool call information from the agent's output, which is needed for RAGAS-based evaluation tests" ] }, { "cell_type": "code", "execution_count": null, - "id": "cb5d72c0", + "id": "67557905", "metadata": {}, "outputs": [], "source": [ "from validmind.models import Prompt\n", - "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list\n", + "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" + ] + }, + { + "cell_type": "markdown", + "id": "c30dd6b1", + "metadata": {}, + "source": [ + "#### Create the agent wrapper function\n", + "\n", + "ValidMind requires a standardized function interface to invoke and test models. We'll create a wrapper function that:\n", + "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", + "- Invokes the banking agent with the proper state initialization\n", + "- Captures tool outputs and tool calls for evaluation\n", + "- Returns a standardized response format that includes the prediction, full output, tool messages, and tool call information\n", + "- Handles errors gracefully with fallback responses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db1fcc20", + "metadata": {}, + "outputs": [], + "source": [ "def banking_agent_fn(input):\n", " \"\"\"\n", " Invoke the banking agent with the given input.\n", @@ -770,18 +808,76 @@ " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", " \"error\": str(e)\n", " }\n", - " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "4ea44f1e", + "metadata": {}, + "source": [ + "#### Initialize the ValidMind model\n", "\n", - "## Initialize the model\n", + "Now we'll register the banking agent as a ValidMind model using `vm.init_model()`. This creates a ValidMind model object that:\n", + "- Associates the wrapper function with the model for prediction\n", + "- Stores the system prompt template for documentation\n", + "- Provides a unique `input_id` for tracking and identification\n", + "- Enables the agent to be used with ValidMind's testing and documentation features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4389e36", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the model\n", "vm_banking_model = vm.init_model(\n", " input_id=\"banking_agent_model\",\n", " predict_fn=banking_agent_fn,\n", " prompt=Prompt(template=system_context)\n", - ")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cd6eb68b", + "metadata": {}, + "source": [ + "#### Store the agent reference\n", "\n", + "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e39d400", + "metadata": {}, + "outputs": [], + "source": [ "# Add the banking agent to the vm model\n", - "vm_banking_model.model = banking_agent\n", + "vm_banking_model.model = banking_agent" + ] + }, + { + "cell_type": "markdown", + "id": "2db4b849", + "metadata": {}, + "source": [ + "#### Verify integration\n", "\n", + "Let's confirm that the banking agent has been successfully integrated with ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59afbb6d", + "metadata": {}, + "outputs": [], + "source": [ "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", "print(f\"Model ID: {vm_banking_model.input_id}\")" ] From 569045e845acb4ca9dce18c2c6f80fb83544a8e6 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 14:55:22 -0800 Subject: [PATCH 13/54] Save point --- .../code_samples/agents/document_agentic_ai.ipynb | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 16b33b25f..e87926f7b 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -705,7 +705,7 @@ "source": [ "### Integrate agent with ValidMind\n", "\n", - "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation. This integration allows ValidMind to run validation tests on the agent's behavior, tool usage, and responses." + "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
] }, { @@ -715,9 +715,7 @@ "source": [ "#### Import ValidMind integration components\n", "\n", - "We'll import the necessary ValidMind components to integrate our agent:\n", - "- `Prompt` - Used to store the system prompt template for documentation\n", - "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` - Utilities to extract and format tool call information from the agent's output, which is needed for RAGAS-based evaluation tests" + "We'll import the `deepeval` module from `validmind.scorers.llm` to access tool call extraction utilities via `vm.scorers.llm.deepeval`, ValidMind components necessary for integrating our agent:" ] }, { @@ -727,8 +725,8 @@ "metadata": {}, "outputs": [], "source": [ - "from validmind.models import Prompt\n", - "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" + "# Import the deepeval module to access tool call extraction utilities via vm.scorers.llm.deepeval\n", + "import vm.scorers.llm.deepeval" ] }, { From 18d702c6dc9962c0004e503a148de5bbf9798e77 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 15:03:49 -0800 Subject: [PATCH 14/54] Save point --- notebooks/code_samples/agents/document_agentic_ai.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index e87926f7b..fd6bf8fea 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -715,7 +715,7 @@ "source": [ "#### Import ValidMind integration components\n", "\n", - "We'll import the `deepeval` module from `validmind.scorers.llm` to access tool call extraction utilities via `vm.scorers.llm.deepeval`, ValidMind components necessary for integrating our agent:" + "We'll import the `deepeval` module from `validmind.scorers.llm` to access tool call extraction utilities like `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list`, ValidMind components necessary for integrating our agent:" ] }, { @@ -725,8 +725,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Import the deepeval module to access tool call extraction utilities via vm.scorers.llm.deepeval\n", - "import vm.scorers.llm.deepeval" + "import validmind.scorers.llm.deepeval" ] }, { From 6ae88669f2b9a4196b86a49a7bba4e0d4484924d Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 15:13:36 -0800 Subject: [PATCH 15/54] Save point --- .../code_samples/agents/document_agentic_ai.ipynb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index fd6bf8fea..58bea1bb6 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -713,7 +713,7 @@ "id": "e2540236", "metadata": {}, "source": [ - "#### Import ValidMind integration components\n", + "#### Import ValidMind scorers\n", "\n", "We'll import the `deepeval` module from `validmind.scorers.llm` to access tool call extraction utilities like `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list`, ValidMind components necessary for integrating our agent:" ] @@ -813,9 +813,13 @@ "id": "4ea44f1e", "metadata": {}, "source": [ - "#### Initialize the ValidMind model\n", + "#### 
Initialize the ValidMind model object\n", + "\n", + "\n", + "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", + "\n", + "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model) that:\n", "\n", - "Now we'll register the banking agent as a ValidMind model using `vm.init_model()`. This creates a ValidMind model object that:\n", "- Associates the wrapper function with the model for prediction\n", "- Stores the system prompt template for documentation\n", "- Provides a unique `input_id` for tracking and identification\n", @@ -829,7 +833,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Initialize the model\n", + "# Initialize the agent as a model\n", "vm_banking_model = vm.init_model(\n", " input_id=\"banking_agent_model\",\n", " predict_fn=banking_agent_fn,\n", From bc188bb2d8db108536e3dc5bc739dc52b2ecf6f6 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 15:22:35 -0800 Subject: [PATCH 16/54] Clarifying OpenAI access --- notebooks/code_samples/agents/document_agentic_ai.ipynb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 58bea1bb6..29890e187 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -12,7 +12,11 @@ "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", "\n", "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", - "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool." + "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool.\n", + "\n", + "
For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", + "

\n", + "Before you continue, ensure that a valid OPENAI_API_KEY is set in your .env file.
" ] }, { From 2f7bca0c57039d0252678ca46f7648198c15b7c5 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 15:33:57 -0800 Subject: [PATCH 17/54] Save point --- .../agents/document_agentic_ai.ipynb | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 29890e187..3d0c9b749 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -345,7 +345,7 @@ "id": "bf4fc0d7", "metadata": {}, "source": [ - "### Test the available banking tools\n", + "### Test available banking tools\n", "\n", "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", "\n", @@ -479,7 +479,7 @@ "id": "6a5beb28", "metadata": {}, "source": [ - "#### Define the system prompt\n", + "#### Define system prompt\n", "\n", "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" ] @@ -578,7 +578,7 @@ "id": "5f898062", "metadata": {}, "source": [ - "#### Define the agent state structure\n", + "#### Define agent state structure\n", "\n", "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", "\n", @@ -610,7 +610,7 @@ "id": "31b261b2", "metadata": {}, "source": [ - "#### Create the agent workflow function\n", + "#### Create agent workflow function\n", "\n", "We'll build the LangGraph agent workflow with two main components:\n", "\n", @@ -717,9 +717,12 @@ "id": "e2540236", "metadata": {}, "source": [ - "#### Import ValidMind scorers\n", + "#### Import ValidMind components\n", "\n", - "We'll import the `deepeval` module from `validmind.scorers.llm` to access tool call extraction utilities like `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list`, ValidMind components necessary for integrating our agent:" + "We'll start with importing the necessary ValidMind components for integrating our agent:\n", + "\n", + "- `Prompt` from `validmind.models` for handling prompt-based model inputs\n", + "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` from `validmind.scorers.llm.deepeval` for extracting and converting tool calls from agent outputs" ] }, { @@ -729,7 +732,8 @@ "metadata": {}, "outputs": [], "source": [ - "import validmind.scorers.llm.deepeval" + "from validmind.models import Prompt\n", + "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" ] }, { @@ -737,9 +741,10 @@ "id": "c30dd6b1", "metadata": {}, "source": [ - "#### Create the agent wrapper function\n", + "#### Create agent wrapper function\n", + "\n", + "As ValidMind requires a standardized function interface to invoke and test models, we'll then create a wrapper function that:\n", "\n", - "ValidMind requires a standardized function interface to invoke and test models. 
We'll create a wrapper function that:\n", "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", "- Invokes the banking agent with the proper state initialization\n", "- Captures tool outputs and tool calls for evaluation\n", From 054f0ac16b1ccc10cf8d4f7b1870941bed1e8734 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 15:37:25 -0800 Subject: [PATCH 18/54] Save point --- .../agents/document_agentic_ai.ipynb | 93 ++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 3d0c9b749..3a8fd2076 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -743,7 +743,7 @@ "source": [ "#### Create agent wrapper function\n", "\n", - "As ValidMind requires a standardized function interface to invoke and test models, we'll then create a wrapper function that:\n", + "We'll then create a wrapper function that:\n", "\n", "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", "- Invokes the banking agent with the proper state initialization\n", @@ -892,6 +892,97 @@ "print(f\"Model ID: {vm_banking_model.input_id}\")" ] }, + { + "cell_type": "markdown", + "id": "af84f571", + "metadata": {}, + "source": [ + "### Validate the system prompt\n", + "\n", + "Let's get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **Clarity**: How clearly the prompt states the task.\n", + "- **Conciseness**: How succinctly the prompt states the task.\n", + "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **NegativeInstruction**: Whether the prompt contains negative instructions.\n", + "- **Specificity**: How specific the prompt defines the task." 
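Because the five checks below all take the same inputs, an equivalent way to run them (shown here only as a sketch, assuming `vm` and `vm_banking_model` from earlier cells) is to loop over the test IDs rather than calling each one in a separate cell:

```python
# Sketch: run all five prompt-validation checks against the agent's system prompt.
prompt_checks = [
    "validmind.prompt_validation.Clarity",
    "validmind.prompt_validation.Conciseness",
    "validmind.prompt_validation.Delimitation",
    "validmind.prompt_validation.NegativeInstruction",
    "validmind.prompt_validation.Specificity",
]

for test_id in prompt_checks:
    # Each call scores the system prompt on one criterion and logs the result
    # to the model's documentation.
    vm.tests.run_test(test_id, inputs={"model": vm_banking_model}).log()
```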
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f52dceb1", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70d52333", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aa89976", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8630197e", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3bd1038", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, { "cell_type": "markdown", "id": "72041947", From 001d8b6f1337367023fd1de491f6a4710af2254e Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 15:51:02 -0800 Subject: [PATCH 19/54] Setup - Running tests --- .../agents/document_agentic_ai.ipynb | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 3a8fd2076..cf5e9ccf8 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -892,6 +892,18 @@ "print(f\"Model ID: {vm_banking_model.input_id}\")" ] }, + { + "cell_type": "markdown", + "id": "72041947", + "metadata": {}, + "source": [ + "## Running tests\n", + "\n", + "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module.\n", + "\n", + "In this section, we'll run validation tests on both our defined system prompt as well as evaluation tests on our agent's performance." + ] + }, { "cell_type": "markdown", "id": "af84f571", @@ -899,13 +911,15 @@ "source": [ "### Validate the system prompt\n", "\n", - "Let's get an initial sense of how well the prompt meets a few best practices for prompt engineering. 
These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", + "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering.\n", "\n", - "- **Clarity**: How clearly the prompt states the task.\n", - "- **Conciseness**: How succinctly the prompt states the task.\n", - "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **NegativeInstruction**: Whether the prompt contains negative instructions.\n", - "- **Specificity**: How specific the prompt defines the task." + "Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n", + "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n", + "- **[Delimitation](https://docs.validmind.ai/tests/prompt_validation/Delimitation.html)** — When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **[NegativeInstruction](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — Whether the prompt contains negative instructions.\n", + "- **[Specificity](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — How specific the prompt defines the task." ] }, { @@ -985,10 +999,10 @@ }, { "cell_type": "markdown", - "id": "72041947", + "id": "99e70a96", "metadata": {}, "source": [ - "## Running tests" + "### Evaluate the banking agent" ] }, { From 1114efd6bc81e188d4b2efb1b062e1d1a268eb20 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 15:54:03 -0800 Subject: [PATCH 20/54] Save point --- notebooks/code_samples/agents/document_agentic_ai.ipynb | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index cf5e9ccf8..caf1f804d 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -246,9 +246,7 @@ "source": [ "### Verify OpenAI API access\n", "\n", - "For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", - "\n", - "Before continuing, make sure that a valid `OPENAI_API_KEY` is set in your `.env` file:" + "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" ] }, { @@ -617,7 +615,7 @@ "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", "\n", - "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response. The agent will also use memory to maintain conversation context across multiple interactions." + "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response." 
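The branching just described maps onto LangGraph's conditional edges. The sketch below reuses the `BankingAgentState`, `llm_node`, and `AVAILABLE_TOOLS` names defined earlier in this notebook and is meant only to illustrate the wiring, not to replace the agent-construction cell:

```python
# Sketch of the LLM -> tools -> LLM branching, using standard LangGraph wiring.
from langgraph.graph import StateGraph, START, END
from langgraph.prebuilt import ToolNode

def should_continue(state: BankingAgentState) -> str:
    """Route to the tools node when the last LLM message requested a tool call."""
    last_message = state["messages"][-1]
    return "tools" if getattr(last_message, "tool_calls", None) else END

workflow = StateGraph(BankingAgentState)
workflow.add_node("llm", llm_node)                      # LLM node from the agent-construction cell
workflow.add_node("tools", ToolNode(AVAILABLE_TOOLS))   # executes the selected banking tool
workflow.add_edge(START, "llm")
workflow.add_conditional_edges("llm", should_continue, {"tools": "tools", END: END})
workflow.add_edge("tools", "llm")                       # hand tool results back to the LLM
```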
] }, { From a168ec409dba259f22bcfb746099048276db4b0d Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 16:04:52 -0800 Subject: [PATCH 21/54] Save point --- .../agents/document_agentic_ai.ipynb | 2291 +++++++++-------- 1 file changed, 1174 insertions(+), 1117 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index caf1f804d..f1614b6de 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1,1121 +1,1178 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "382caa31", - "metadata": {}, - "source": [ - "# Document an agentic AI system\n", - "\n", - "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, prepare your datasets and agent for testing, run out-of-the-box and custom tests and log those test results to the ValidMind Platform.\n", - "\n", - "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", - "\n", - "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", - "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool.\n", - "\n", - "
For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", - "

\n", - "Before you continue, ensure that a valid OPENAI_API_KEY is set in your .env file.
" - ] - }, - { - "cell_type": "markdown", - "id": "about-intro-d367bebc-fcd0-46df-93ba-5f731f4b9ba9", - "metadata": {}, - "source": [ - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." - ] - }, - { - "cell_type": "markdown", - "id": "about-begin-c1d3d399-73ca-4a88-9ca8-16603eab0751", - "metadata": {}, - "source": [ - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." - ] - }, - { - "cell_type": "markdown", - "id": "about-signup-65fed156-8e69-49eb-b04e-ae52c574d8e7", - "metadata": {}, - "source": [ - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
" - ] - }, - { - "cell_type": "markdown", - "id": "about-concepts-f49034ef-bfb8-4643-88dc-5647eb76968a", - "metadata": {}, - "source": [ - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", - "\n", - "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. 
Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." - ] - }, - { - "cell_type": "markdown", - "id": "cell-d35096df-f3f8-4034-87fa-1973bbdd7b49-efbed180-2181-4735-83c9-7d12b69b9f4c", - "metadata": {}, - "source": [ - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "id": "install-library-20ef5375-d71d-47b1-b669-ed90cb3f0e90", - "metadata": {}, - "source": [ - "### Install the ValidMind Library\n", - "\n", - "
Recommended Python versions\n", - "

\n", - "Python 3.8 <= x <= 3.11
\n", - "\n", - "Let's begin by installing the ValidMind Library with large language model (LLM) support:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-python-94668f4a-272a-4d93-b7c2-da735557fd37", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" - ] - }, - { - "cell_type": "markdown", - "id": "install-initialize-13c50d00-6167-4147-a886-541d73129898", - "metadata": {}, - "source": [ - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "id": "install-register-ab56529b-d2f2-4942-b18e-603fb7342e5b", - "metadata": {}, - "source": [ - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook.\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "id": "install-template-6e8d6279-0edf-442a-bb63-50f7f89e60b3", - "metadata": {}, - "source": [ - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Agentic AI`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "id": "e4a16ffa", - "metadata": {}, - "source": [ - "
Can't select this template?\n", - "

\n", - "Your organization administrators may need to add it to your template library:\n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "install-snippet-468eb7c3-612b-4f14-96aa-dc5a3088448c", - "metadata": {}, - "source": [ - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-init-9b9b870b-01e8-428d-afbf-8e3d9771b3f4", - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", - "metadata": {}, - "source": [ - "#### Preview the documentation template\n", - "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", - "\n", - "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-preview-template-b5149386-4ab1-41cb-9527-880573d91509", - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "id": "ba45feba", - "metadata": {}, - "source": [ - "### Verify OpenAI API access\n", - "\n", - "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9684fde1", - "metadata": {}, - "outputs": [], - "source": [ - "# Load environment variables if using .env file\n", - "try:\n", - " from dotenv import load_dotenv\n", - " load_dotenv()\n", - "except ImportError:\n", - " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" - ] - }, - { - "cell_type": "markdown", - "id": "679111bb", - "metadata": {}, - "source": [ - "### Initialize the Python environment\n", - "\n", - "Let's import all the necessary libraries to prepare for building our banking LangGraph agentic system:\n", - "\n", - "- **Standard libraries** for data handling and environment management.\n", - "- **pandas**, a Python library for data manipulation and analytics, as an alias. 
We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", - "- **LangChain** components for LLM integration and tool management.\n", - "- **LangGraph** for building stateful, multi-step agent workflows.\n", - "- **Banking tools** for specialized financial services as defined in [banking_tools.py](banking_tools.py)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a64a021", - "metadata": {}, - "outputs": [], - "source": [ - "# STANDARD LIBRARY IMPORTS\n", - "\n", - "# TypedDict: Defines type-safe dictionaries for the agent's state structure\n", - "# Annotated: Adds metadata to type hints\n", - "# Sequence: Type hint for sequences used in the agent\n", - "from typing import TypedDict, Annotated, Sequence\n", - "\n", - "# THIRD PARTY IMPORTS\n", - "\n", - "import pandas as pd\n", - "# Configure pandas to show all columns and all rows at full width\n", - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_colwidth', None)\n", - "pd.set_option('display.width', None)\n", - "pd.set_option('display.max_rows', None)\n", - "\n", - "# BaseMessage: Represents a base message in the LangChain message system\n", - "# HumanMessage: Represents a human message in the LangChain message system\n", - "# SystemMessage: Represents a system message in the LangChain message system\n", - "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", - "\n", - "# ChatOpenAI: Represents an OpenAI chat model in the LangChain library\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "# MemorySaver: Represents a checkpoint for saving and restoring agent state\n", - "from langgraph.checkpoint.memory import MemorySaver\n", - "\n", - "# StateGraph: Represents a stateful graph in the LangGraph library\n", - "# END: Represents the end of a graph\n", - "# START: Represents the start of a graph\n", - "from langgraph.graph import StateGraph, END, START\n", - "\n", - "# add_messages: Adds messages to the state\n", - "from langgraph.graph.message import add_messages\n", - "\n", - "# ToolNode: Represents a tool node in the LangGraph library\n", - "from langgraph.prebuilt import ToolNode\n", - "\n", - "# LOCAL IMPORTS FROM banking_tools.py\n", - "\n", - "from banking_tools import AVAILABLE_TOOLS" - ] - }, - { - "cell_type": "markdown", - "id": "cf6ebc6c", - "metadata": {}, - "source": [ - "## Building the LangGraph agent" - ] - }, - { - "cell_type": "markdown", - "id": "bf4fc0d7", - "metadata": {}, - "source": [ - "### Test available banking tools\n", - "\n", - "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", - "\n", - "- **Credit Risk Analyzer** - Loan applications and credit decisions\n", - "- **Customer Account Manager** - Account services and customer support\n", - "- **Fraud Detection System** - Security and fraud prevention" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c862fdd", - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", - "print(\"\\nTool Details:\")\n", - "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", - " print(f\" - {tool.name}\")" - ] - }, - { - "cell_type": "markdown", - "id": "4d6f0e26", - "metadata": {}, - "source": [ - "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc0caff2", - 
"metadata": {}, - "outputs": [], - "source": [ - "# Test 1: Credit Risk Analyzer\n", - "print(\"TEST 1: Credit Risk Analyzer\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Access the underlying function using .func\n", - " credit_result = AVAILABLE_TOOLS[0].func(\n", - " customer_income=75000,\n", - " customer_debt=1200,\n", - " credit_score=720,\n", - " loan_amount=50000,\n", - " loan_type=\"personal\"\n", - " )\n", - " print(credit_result)\n", - " print(\"Credit Risk Analyzer test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6b227db", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Test 2: Customer Account Manager\n", - "print(\"TEST 2: Customer Account Manager\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Test checking balance\n", - " account_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"checking\",\n", - " customer_id=\"12345\",\n", - " action=\"check_balance\"\n", - " )\n", - " print(account_result)\n", - "\n", - " # Test getting account info\n", - " info_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"all\",\n", - " customer_id=\"12345\", \n", - " action=\"get_info\"\n", - " )\n", - " print(info_result)\n", - " print(\"Customer Account Manager test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Customer Account Manager test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8442bf81", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Test 3: Fraud Detection System\n", - "print(\"TEST 3: Fraud Detection System\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " fraud_result = AVAILABLE_TOOLS[2].func(\n", - " transaction_id=\"TX123\",\n", - " customer_id=\"12345\",\n", - " transaction_amount=500.00,\n", - " transaction_type=\"withdrawal\",\n", - " location=\"Miami, FL\",\n", - " device_id=\"DEVICE_001\"\n", - " )\n", - " print(fraud_result)\n", - " print(\"Fraud Detection System test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Fraud Detection System test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "markdown", - "id": "5ed83560", - "metadata": {}, - "source": [ - "### Create LangGraph banking agent\n", - "\n", - "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." 
- ] - }, - { - "cell_type": "markdown", - "id": "6a5beb28", - "metadata": {}, - "source": [ - "#### Define system prompt\n", - "\n", - "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64f46a1c", - "metadata": {}, - "outputs": [], - "source": [ - "# Enhanced banking system prompt with tool selection guidance\n", - "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", - " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", - "\n", - " AVAILABLE BANKING TOOLS:\n", - "\n", - " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", - " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", - " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", - " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", - "\n", - " customer_account_manager - Manage customer accounts and provide banking services\n", - " - Use for: account information, transaction processing, product recommendations, customer service\n", - " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", - " - Parameters: account_type, customer_id, action, amount, account_details\n", - "\n", - " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", - " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", - " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", - " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", - "\n", - " BANKING INSTRUCTIONS:\n", - " - Analyze the user's banking request carefully and identify the primary need\n", - " - If they need credit analysis → use credit_risk_analyzer\n", - " - If they need financial calculations → use financial_calculator\n", - " - If they need account services → use customer_account_manager\n", - " - If they need security analysis → use fraud_detection_system\n", - " - Extract relevant parameters from the user's request\n", - " - Provide helpful, accurate banking responses based on tool outputs\n", - " - Always consider banking regulations, risk management, and best practices\n", - " - Be professional and thorough in your analysis\n", - "\n", - " Choose and use tools wisely to provide the most helpful banking assistance.\n", - " \"\"\"" - ] - }, - { - "cell_type": "markdown", - "id": "3f8da88d", - "metadata": {}, - "source": [ - "#### Initialize the LLM\n", - "\n", - "Let's initialize the LLM that will power our banking agent:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b828d70", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the main LLM for banking responses\n", - "main_llm = ChatOpenAI(\n", - " model=\"gpt-5-mini\",\n", - " reasoning={\n", - " \"effort\": \"low\",\n", - " \"summary\": \"auto\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "866b59cb", - "metadata": {}, - "source": [ - "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate 
based on request input and the system prompt we defined above:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65f85d86", - "metadata": {}, - "outputs": [], - "source": [ - "# Bind all banking tools to the main LLM\n", - "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)" - ] - }, - { - "cell_type": "markdown", - "id": "5f898062", - "metadata": {}, - "source": [ - "#### Define agent state structure\n", - "\n", - "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", - "\n", - "- **messages** — The conversation history between the user and agent\n", - "- **user_input** — The current user request\n", - "- **session_id** — A unique identifier for the conversation session\n", - "- **context** — Additional context that can be passed between nodes\n", - "\n", - "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7feeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# Banking Agent State Definition\n", - "class BankingAgentState(TypedDict):\n", - " messages: Annotated[Sequence[BaseMessage], add_messages]\n", - " user_input: str\n", - " session_id: str\n", - " context: dict" - ] - }, - { - "cell_type": "markdown", - "id": "31b261b2", - "metadata": {}, - "source": [ - "#### Create agent workflow function\n", - "\n", - "We'll build the LangGraph agent workflow with two main components:\n", - "\n", - "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", - "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", - "\n", - "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "142c20b9", - "metadata": {}, - "outputs": [], - "source": [ - "def create_banking_langgraph_agent():\n", - " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", - " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", - " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", - " messages = state[\"messages\"]\n", - " # Add system context to messages\n", - " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", - " # Get LLM response with tool selection\n", - " response = llm_with_tools.invoke(enhanced_messages)\n", - " return {\n", - " **state,\n", - " \"messages\": messages + [response]\n", - " }\n", - " \n", - " def should_continue(state: BankingAgentState) -> str:\n", - " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", - " last_message = state[\"messages\"][-1]\n", - " # Check if the LLM wants to use tools\n", - " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", - " return \"tools\"\n", - " return END\n", - " \n", - " # Create the banking state graph\n", - " workflow = StateGraph(BankingAgentState)\n", - " # Add nodes\n", - " workflow.add_node(\"llm\", llm_node)\n", - " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", - " # Simplified entry point - go directly to LLM\n", - " workflow.add_edge(START, \"llm\")\n", - " # From LLM, decide whether to use tools or end\n", - " workflow.add_conditional_edges(\n", - " \"llm\",\n", - " should_continue,\n", - " {\"tools\": \"tools\", END: END}\n", - " )\n", - " # Tool execution flows back to LLM for final response\n", - " workflow.add_edge(\"tools\", \"llm\")\n", - " # Set up memory\n", - " memory = MemorySaver()\n", - " # Compile the graph\n", - " agent = workflow.compile(checkpointer=memory)\n", - " return agent" - ] - }, - { - "cell_type": "markdown", - "id": "0f19d4b7", - "metadata": {}, - "source": [ - "#### Instantiate the banking agent\n", - "\n", - "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", - "\n", - "This compiled agent is ready to process banking requests and will automatically select and use the appropriate tools based on user queries:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aabb7842", - "metadata": {}, - "outputs": [], - "source": [ - "# Create the banking intelligent agent\n", - "banking_agent = create_banking_langgraph_agent()\n", - "\n", - "print(\"Banking LangGraph Agent Created Successfully!\")\n", - "print(\"\\nFeatures:\")\n", - "print(\" - Intelligent banking tool selection\")\n", - "print(\" - Comprehensive banking system prompt\")\n", - "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", - "print(\" - Automatic tool parameter extraction\")\n", - "print(\" - Professional banking assistance\")" - ] - }, - { - "cell_type": "markdown", - "id": "cfd302bb", - "metadata": {}, - "source": [ - "### Integrate agent with ValidMind\n", - "\n", - "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
- ] - }, - { - "cell_type": "markdown", - "id": "e2540236", - "metadata": {}, - "source": [ - "#### Import ValidMind components\n", - "\n", - "We'll start with importing the necessary ValidMind components for integrating our agent:\n", - "\n", - "- `Prompt` from `validmind.models` for handling prompt-based model inputs\n", - "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` from `validmind.scorers.llm.deepeval` for extracting and converting tool calls from agent outputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67557905", - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.models import Prompt\n", - "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" - ] - }, - { - "cell_type": "markdown", - "id": "c30dd6b1", - "metadata": {}, - "source": [ - "#### Create agent wrapper function\n", - "\n", - "We'll then create a wrapper function that:\n", - "\n", - "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", - "- Invokes the banking agent with the proper state initialization\n", - "- Captures tool outputs and tool calls for evaluation\n", - "- Returns a standardized response format that includes the prediction, full output, tool messages, and tool call information\n", - "- Handles errors gracefully with fallback responses" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db1fcc20", - "metadata": {}, - "outputs": [], - "source": [ - "def banking_agent_fn(input):\n", - " \"\"\"\n", - " Invoke the banking agent with the given input.\n", - " \"\"\"\n", - " try:\n", - " # Initial state for banking agent\n", - " initial_state = {\n", - " \"user_input\": input[\"input\"],\n", - " \"messages\": [HumanMessage(content=input[\"input\"])],\n", - " \"session_id\": input[\"session_id\"],\n", - " \"context\": {}\n", - " }\n", - " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", - " result = banking_agent.invoke(initial_state, config=session_config)\n", - "\n", - " from utils import capture_tool_output_messages\n", - "\n", - " # Capture all tool outputs and metadata\n", - " captured_data = capture_tool_output_messages(result)\n", - " \n", - " # Access specific tool outputs, this will be used for RAGAS tests\n", - " tool_message = \"\"\n", - " for output in captured_data[\"tool_outputs\"]:\n", - " tool_message += output['content']\n", - " \n", - " tool_calls_found = []\n", - " messages = result['messages']\n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - "\n", - "\n", - " return {\n", - " \"prediction\": result['messages'][-1].content[0]['text'],\n", - " \"output\": result,\n", - " \"tool_messages\": [tool_message],\n", - " # \"tool_calls\": tool_calls_found,\n", - " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", - " }\n", - " except Exception as e:\n", - " # Return a fallback response if the agent fails\n", - " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", - " Please try rephrasing your question or contact support if the issue 
persists.\"\"\"\n", - " return {\n", - " \"prediction\": error_message, \n", - " \"output\": {\n", - " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", - " \"error\": str(e)\n", - " }\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "4ea44f1e", - "metadata": {}, - "source": [ - "#### Initialize the ValidMind model object\n", - "\n", - "\n", - "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", - "\n", - "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model) that:\n", - "\n", - "- Associates the wrapper function with the model for prediction\n", - "- Stores the system prompt template for documentation\n", - "- Provides a unique `input_id` for tracking and identification\n", - "- Enables the agent to be used with ValidMind's testing and documentation features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4389e36", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the agent as a model\n", - "vm_banking_model = vm.init_model(\n", - " input_id=\"banking_agent_model\",\n", - " predict_fn=banking_agent_fn,\n", - " prompt=Prompt(template=system_context)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cd6eb68b", - "metadata": {}, - "source": [ - "#### Store the agent reference\n", - "\n", - "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e39d400", - "metadata": {}, - "outputs": [], - "source": [ - "# Add the banking agent to the vm model\n", - "vm_banking_model.model = banking_agent" - ] - }, - { - "cell_type": "markdown", - "id": "2db4b849", - "metadata": {}, - "source": [ - "#### Verify integration\n", - "\n", - "Let's confirm that the banking agent has been successfully integrated with ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59afbb6d", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", - "print(f\"Model ID: {vm_banking_model.input_id}\")" - ] - }, - { - "cell_type": "markdown", - "id": "72041947", - "metadata": {}, - "source": [ - "## Running tests\n", - "\n", - "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module.\n", - "\n", - "In this section, we'll run validation tests on both our defined system prompt as well as evaluation tests on our agent's performance." 
- ] - }, - { - "cell_type": "markdown", - "id": "af84f571", - "metadata": {}, - "source": [ - "### Validate the system prompt\n", - "\n", - "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering.\n", - "\n", - "Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n", - "\n", - "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n", - "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n", - "- **[Delimitation](https://docs.validmind.ai/tests/prompt_validation/Delimitation.html)** — When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **[NegativeInstruction](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — Whether the prompt contains negative instructions.\n", - "- **[Specificity](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — How specific the prompt defines the task." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f52dceb1", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Clarity\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70d52333", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Conciseness\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5aa89976", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Delimitation\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8630197e", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.NegativeInstruction\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3bd1038", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Specificity\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "99e70a96", - "metadata": {}, - "source": [ - "### Evaluate the banking agent" - ] - }, - { - "cell_type": "markdown", - "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", - "metadata": {}, - "source": [ - "## Next steps\n", - "\n", - "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." - ] - }, - { - "cell_type": "markdown", - "id": "next-docs-7773f25a-ef1c-40d1-9b4b-f6cecdf7dc6c", - "metadata": {}, - "source": [ - "### Work with your model documentation\n", - "\n", - "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. 
([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", - "\n", - "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", - "\n", - "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" - ] - }, - { - "cell_type": "markdown", - "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", - "metadata": {}, - "source": [ - "### Discover more learning resources\n", - "\n", - "We offer many interactive notebooks to help you document models:\n", - "\n", - "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", - "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", - "\n", - "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." - ] - }, - { - "cell_type": "markdown", - "id": "upgrade-vm-362dbd51-d699-405b-b817-f95381861976", - "metadata": {}, - "source": [ - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "id": "upgrade-version-61359c61-7218-4a90-9f6f-70794c42d997", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "upgrade-restart-c45548b7-aee3-4b96-b412-37c781071787", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-99bac407-03cd-4fe3-83c8-0a280d4caa64", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10" + "cells": [ + { + "cell_type": "markdown", + "id": "382caa31", + "metadata": {}, + "source": [ + "# Document an agentic AI system\n", + "\n", + "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, prepare your datasets and agent for testing, run out-of-the-box and custom tests and log those test results to the ValidMind Platform.\n", + "\n", + "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", + "\n", + "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", + "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool.\n", + "\n", + "
For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", + "

\n", + "Before you continue, ensure that a valid OPENAI_API_KEY is set in your .env file.
" + ] + }, + { + "cell_type": "markdown", + "id": "about-intro-d367bebc-fcd0-46df-93ba-5f731f4b9ba9", + "metadata": {}, + "source": [ + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." + ] + }, + { + "cell_type": "markdown", + "id": "about-begin-c1d3d399-73ca-4a88-9ca8-16603eab0751", + "metadata": {}, + "source": [ + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "about-signup-65fed156-8e69-49eb-b04e-ae52c574d8e7", + "metadata": {}, + "source": [ + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
 To access all features available in this notebook, you'll need a ValidMind account.\n", + " 

\n", + "Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "id": "about-concepts-f49034ef-bfb8-4643-88dc-5647eb76968a", + "metadata": {}, + "source": [ + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", + "\n", + "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. 
Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." + ] + }, + { + "cell_type": "markdown", + "id": "cell-d35096df-f3f8-4034-87fa-1973bbdd7b49-efbed180-2181-4735-83c9-7d12b69b9f4c", + "metadata": {}, + "source": [ + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "install-library-20ef5375-d71d-47b1-b669-ed90cb3f0e90", + "metadata": {}, + "source": [ + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "Let's begin by installing the ValidMind Library with large language model (LLM) support:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-python-94668f4a-272a-4d93-b7c2-da735557fd37", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" + ] + }, + { + "cell_type": "markdown", + "id": "install-initialize-13c50d00-6167-4147-a886-541d73129898", + "metadata": {}, + "source": [ + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "id": "install-register-ab56529b-d2f2-4942-b18e-603fb7342e5b", + "metadata": {}, + "source": [ + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook.\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "id": "install-template-6e8d6279-0edf-442a-bb63-50f7f89e60b3", + "metadata": {}, + "source": [ + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Agentic AI`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "id": "e4a16ffa", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "install-snippet-468eb7c3-612b-4f14-96aa-dc5a3088448c", + "metadata": {}, + "source": [ + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-init-9b9b870b-01e8-428d-afbf-8e3d9771b3f4", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", + "metadata": {}, + "source": [ + "#### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-preview-template-b5149386-4ab1-41cb-9527-880573d91509", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "ba45feba", + "metadata": {}, + "source": [ + "### Verify OpenAI API access\n", + "\n", + "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9684fde1", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables if using .env file\n", + "try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv()\n", + "except ImportError:\n", + " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" + ] + }, + { + "cell_type": "markdown", + "id": "679111bb", + "metadata": {}, + "source": [ + "### Initialize the Python environment\n", + "\n", + "Let's import all the necessary libraries to prepare for building our banking LangGraph agentic system:\n", + "\n", + "- **Standard libraries** for data handling and environment management.\n", + "- **pandas**, a Python library for data manipulation and analytics, as an alias. 
We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", + "- **LangChain** components for LLM integration and tool management.\n", + "- **LangGraph** for building stateful, multi-step agent workflows.\n", + "- **Banking tools** for specialized financial services as defined in [banking_tools.py](banking_tools.py)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a64a021", + "metadata": {}, + "outputs": [], + "source": [ + "# STANDARD LIBRARY IMPORTS\n", + "\n", + "# TypedDict: Defines type-safe dictionaries for the agent's state structure\n", + "# Annotated: Adds metadata to type hints\n", + "# Sequence: Type hint for sequences used in the agent\n", + "from typing import TypedDict, Annotated, Sequence\n", + "\n", + "# THIRD PARTY IMPORTS\n", + "\n", + "import pandas as pd\n", + "# Configure pandas to show all columns and all rows at full width\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)\n", + "pd.set_option('display.max_rows', None)\n", + "\n", + "# BaseMessage: Represents a base message in the LangChain message system\n", + "# HumanMessage: Represents a human message in the LangChain message system\n", + "# SystemMessage: Represents a system message in the LangChain message system\n", + "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", + "\n", + "# ChatOpenAI: Represents an OpenAI chat model in the LangChain library\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "# MemorySaver: Represents a checkpoint for saving and restoring agent state\n", + "from langgraph.checkpoint.memory import MemorySaver\n", + "\n", + "# StateGraph: Represents a stateful graph in the LangGraph library\n", + "# END: Represents the end of a graph\n", + "# START: Represents the start of a graph\n", + "from langgraph.graph import StateGraph, END, START\n", + "\n", + "# add_messages: Adds messages to the state\n", + "from langgraph.graph.message import add_messages\n", + "\n", + "# ToolNode: Represents a tool node in the LangGraph library\n", + "from langgraph.prebuilt import ToolNode\n", + "\n", + "# LOCAL IMPORTS FROM banking_tools.py\n", + "\n", + "from banking_tools import AVAILABLE_TOOLS" + ] + }, + { + "cell_type": "markdown", + "id": "cf6ebc6c", + "metadata": {}, + "source": [ + "## Building the LangGraph agent" + ] + }, + { + "cell_type": "markdown", + "id": "bf4fc0d7", + "metadata": {}, + "source": [ + "### Test available banking tools\n", + "\n", + "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", + "\n", + "- **Credit Risk Analyzer** - Loan applications and credit decisions\n", + "- **Customer Account Manager** - Account services and customer support\n", + "- **Fraud Detection System** - Security and fraud prevention" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c862fdd", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", + "print(\"\\nTool Details:\")\n", + "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", + " print(f\" - {tool.name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4d6f0e26", + "metadata": {}, + "source": [ + "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc0caff2", + 
"metadata": {}, + "outputs": [], + "source": [ + "# Test 1: Credit Risk Analyzer\n", + "print(\"TEST 1: Credit Risk Analyzer\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " # Access the underlying function using .func\n", + " credit_result = AVAILABLE_TOOLS[0].func(\n", + " customer_income=75000,\n", + " customer_debt=1200,\n", + " credit_score=720,\n", + " loan_amount=50000,\n", + " loan_type=\"personal\"\n", + " )\n", + " print(credit_result)\n", + " print(\"Credit Risk Analyzer test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6b227db", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Test 2: Customer Account Manager\n", + "print(\"TEST 2: Customer Account Manager\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " # Test checking balance\n", + " account_result = AVAILABLE_TOOLS[1].func(\n", + " account_type=\"checking\",\n", + " customer_id=\"12345\",\n", + " action=\"check_balance\"\n", + " )\n", + " print(account_result)\n", + "\n", + " # Test getting account info\n", + " info_result = AVAILABLE_TOOLS[1].func(\n", + " account_type=\"all\",\n", + " customer_id=\"12345\", \n", + " action=\"get_info\"\n", + " )\n", + " print(info_result)\n", + " print(\"Customer Account Manager test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Customer Account Manager test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8442bf81", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Test 3: Fraud Detection System\n", + "print(\"TEST 3: Fraud Detection System\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " fraud_result = AVAILABLE_TOOLS[2].func(\n", + " transaction_id=\"TX123\",\n", + " customer_id=\"12345\",\n", + " transaction_amount=500.00,\n", + " transaction_type=\"withdrawal\",\n", + " location=\"Miami, FL\",\n", + " device_id=\"DEVICE_001\"\n", + " )\n", + " print(fraud_result)\n", + " print(\"Fraud Detection System test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Fraud Detection System test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "5ed83560", + "metadata": {}, + "source": [ + "### Create LangGraph banking agent\n", + "\n", + "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6a5beb28", + "metadata": {}, + "source": [ + "#### Define system prompt\n", + "\n", + "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64f46a1c", + "metadata": {}, + "outputs": [], + "source": [ + "# Enhanced banking system prompt with tool selection guidance\n", + "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", + " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", + "\n", + " AVAILABLE BANKING TOOLS:\n", + "\n", + " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", + " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", + " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", + " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", + "\n", + " customer_account_manager - Manage customer accounts and provide banking services\n", + " - Use for: account information, transaction processing, product recommendations, customer service\n", + " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", + " - Parameters: account_type, customer_id, action, amount, account_details\n", + "\n", + " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", + " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", + " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", + " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", + "\n", + " BANKING INSTRUCTIONS:\n", + " - Analyze the user's banking request carefully and identify the primary need\n", + " - If they need credit analysis → use credit_risk_analyzer\n", + " - If they need financial calculations → use financial_calculator\n", + " - If they need account services → use customer_account_manager\n", + " - If they need security analysis → use fraud_detection_system\n", + " - Extract relevant parameters from the user's request\n", + " - Provide helpful, accurate banking responses based on tool outputs\n", + " - Always consider banking regulations, risk management, and best practices\n", + " - Be professional and thorough in your analysis\n", + "\n", + " Choose and use tools wisely to provide the most helpful banking assistance.\n", + " \"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "3f8da88d", + "metadata": {}, + "source": [ + "#### Initialize the LLM\n", + "\n", + "Let's initialize the LLM that will power our banking agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b828d70", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the main LLM for banking responses\n", + "main_llm = ChatOpenAI(\n", + " model=\"gpt-5-mini\",\n", + " reasoning={\n", + " \"effort\": \"low\",\n", + " \"summary\": \"auto\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "866b59cb", + "metadata": {}, + "source": [ + "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate 
based on request input and the system prompt we defined above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65f85d86", + "metadata": {}, + "outputs": [], + "source": [ + "# Bind all banking tools to the main LLM\n", + "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)" + ] + }, + { + "cell_type": "markdown", + "id": "5f898062", + "metadata": {}, + "source": [ + "#### Define agent state structure\n", + "\n", + "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", + "\n", + "- **messages** — The conversation history between the user and agent\n", + "- **user_input** — The current user request\n", + "- **session_id** — A unique identifier for the conversation session\n", + "- **context** — Additional context that can be passed between nodes\n", + "\n", + "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7feeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# Banking Agent State Definition\n", + "class BankingAgentState(TypedDict):\n", + " messages: Annotated[Sequence[BaseMessage], add_messages]\n", + " user_input: str\n", + " session_id: str\n", + " context: dict" + ] + }, + { + "cell_type": "markdown", + "id": "31b261b2", + "metadata": {}, + "source": [ + "#### Create agent workflow function\n", + "\n", + "We'll build the LangGraph agent workflow with two main components:\n", + "\n", + "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", + "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", + "\n", + "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response." 
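 + "\n", + "\n", + "As a rough sketch of the control flow we're about to wire up (an illustration only, not code that runs anywhere in this notebook):\n", + "\n", + "```\n", + "START → llm\n", + "llm → tools → llm (when the last message contains tool calls)\n", + "llm → END (when no tool calls are present)\n", + "```" 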
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142c20b9", + "metadata": {}, + "outputs": [], + "source": [ + "def create_banking_langgraph_agent():\n", + " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", + " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", + " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", + " messages = state[\"messages\"]\n", + " # Add system context to messages\n", + " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", + " # Get LLM response with tool selection\n", + " response = llm_with_tools.invoke(enhanced_messages)\n", + " return {\n", + " **state,\n", + " \"messages\": messages + [response]\n", + " }\n", + " \n", + " def should_continue(state: BankingAgentState) -> str:\n", + " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", + " last_message = state[\"messages\"][-1]\n", + " # Check if the LLM wants to use tools\n", + " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", + " return \"tools\"\n", + " return END\n", + " \n", + " # Create the banking state graph\n", + " workflow = StateGraph(BankingAgentState)\n", + " # Add nodes\n", + " workflow.add_node(\"llm\", llm_node)\n", + " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", + " # Simplified entry point - go directly to LLM\n", + " workflow.add_edge(START, \"llm\")\n", + " # From LLM, decide whether to use tools or end\n", + " workflow.add_conditional_edges(\n", + " \"llm\",\n", + " should_continue,\n", + " {\"tools\": \"tools\", END: END}\n", + " )\n", + " # Tool execution flows back to LLM for final response\n", + " workflow.add_edge(\"tools\", \"llm\")\n", + " # Set up memory\n", + " memory = MemorySaver()\n", + " # Compile the graph\n", + " agent = workflow.compile(checkpointer=memory)\n", + " return agent" + ] + }, + { + "cell_type": "markdown", + "id": "0f19d4b7", + "metadata": {}, + "source": [ + "#### Instantiate the banking agent\n", + "\n", + "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", + "\n", + "This compiled agent is ready to process banking requests and will automatically select and use the appropriate tools based on user queries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aabb7842", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the banking intelligent agent\n", + "banking_agent = create_banking_langgraph_agent()\n", + "\n", + "print(\"Banking LangGraph Agent Created Successfully!\")\n", + "print(\"\\nFeatures:\")\n", + "print(\" - Intelligent banking tool selection\")\n", + "print(\" - Comprehensive banking system prompt\")\n", + "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", + "print(\" - Automatic tool parameter extraction\")\n", + "print(\" - Professional banking assistance\")" + ] + }, + { + "cell_type": "markdown", + "id": "cfd302bb", + "metadata": {}, + "source": [ + "### Integrate agent with ValidMind\n", + "\n", + "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
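 + "\n", + "\n", + "As a simplified preview of the wrapper we define below, the core call pattern looks roughly like this (the name `sketch_agent_fn` is illustrative; the full `banking_agent_fn` below also extracts the final text, tool messages, and tool calls from the result):\n", + "\n", + "```python\n", + "# Simplified sketch; see the full banking_agent_fn implementation below\n", + "def sketch_agent_fn(input):\n", + " state = {\n", + " \"user_input\": input[\"input\"],\n", + " \"messages\": [HumanMessage(content=input[\"input\"])],\n", + " \"session_id\": input[\"session_id\"],\n", + " \"context\": {},\n", + " }\n", + " config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", + " return banking_agent.invoke(state, config=config)\n", + "```" 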
+ ] + }, + { + "cell_type": "markdown", + "id": "e2540236", + "metadata": {}, + "source": [ + "#### Import ValidMind components\n", + "\n", + "We'll start with importing the necessary ValidMind components for integrating our agent:\n", + "\n", + "- `Prompt` from `validmind.models` for handling prompt-based model inputs\n", + "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` from `validmind.scorers.llm.deepeval` for extracting and converting tool calls from agent outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67557905", + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.models import Prompt\n", + "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" + ] + }, + { + "cell_type": "markdown", + "id": "c30dd6b1", + "metadata": {}, + "source": [ + "#### Create agent wrapper function\n", + "\n", + "We'll then create a wrapper function that:\n", + "\n", + "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", + "- Invokes the banking agent with the proper state initialization\n", + "- Captures tool outputs and tool calls for evaluation\n", + "- Returns a standardized response format that includes the prediction, full output, tool messages, and tool call information\n", + "- Handles errors gracefully with fallback responses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db1fcc20", + "metadata": {}, + "outputs": [], + "source": [ + "def banking_agent_fn(input):\n", + " \"\"\"\n", + " Invoke the banking agent with the given input.\n", + " \"\"\"\n", + " try:\n", + " # Initial state for banking agent\n", + " initial_state = {\n", + " \"user_input\": input[\"input\"],\n", + " \"messages\": [HumanMessage(content=input[\"input\"])],\n", + " \"session_id\": input[\"session_id\"],\n", + " \"context\": {}\n", + " }\n", + " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", + " result = banking_agent.invoke(initial_state, config=session_config)\n", + "\n", + " from utils import capture_tool_output_messages\n", + "\n", + " # Capture all tool outputs and metadata\n", + " captured_data = capture_tool_output_messages(result)\n", + " \n", + " # Access specific tool outputs, this will be used for RAGAS tests\n", + " tool_message = \"\"\n", + " for output in captured_data[\"tool_outputs\"]:\n", + " tool_message += output['content']\n", + " \n", + " tool_calls_found = []\n", + " messages = result['messages']\n", + " for message in messages:\n", + " if hasattr(message, 'tool_calls') and message.tool_calls:\n", + " for tool_call in message.tool_calls:\n", + " # Handle both dictionary and object formats\n", + " if isinstance(tool_call, dict):\n", + " tool_calls_found.append(tool_call['name'])\n", + " else:\n", + " # ToolCall object - use attribute access\n", + " tool_calls_found.append(tool_call.name)\n", + "\n", + "\n", + " return {\n", + " \"prediction\": result['messages'][-1].content[0]['text'],\n", + " \"output\": result,\n", + " \"tool_messages\": [tool_message],\n", + " # \"tool_calls\": tool_calls_found,\n", + " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", + " }\n", + " except Exception as e:\n", + " # Return a fallback response if the agent fails\n", + " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", + " Please try rephrasing your question or contact support if the issue 
persists.\"\"\"\n", + " return {\n", + " \"prediction\": error_message, \n", + " \"output\": {\n", + " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", + " \"error\": str(e)\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "4ea44f1e", + "metadata": {}, + "source": [ + "#### Initialize the ValidMind model object\n", + "\n", + "\n", + "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", + "\n", + "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model) that:\n", + "\n", + "- Associates the wrapper function with the model for prediction\n", + "- Stores the system prompt template for documentation\n", + "- Provides a unique `input_id` for tracking and identification\n", + "- Enables the agent to be used with ValidMind's testing and documentation features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4389e36", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the agent as a model\n", + "vm_banking_model = vm.init_model(\n", + " input_id=\"banking_agent_model\",\n", + " predict_fn=banking_agent_fn,\n", + " prompt=Prompt(template=system_context)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cd6eb68b", + "metadata": {}, + "source": [ + "#### Store the agent reference\n", + "\n", + "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e39d400", + "metadata": {}, + "outputs": [], + "source": [ + "# Add the banking agent to the vm model\n", + "vm_banking_model.model = banking_agent" + ] + }, + { + "cell_type": "markdown", + "id": "2db4b849", + "metadata": {}, + "source": [ + "#### Verify integration\n", + "\n", + "Let's confirm that the banking agent has been successfully integrated with ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59afbb6d", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", + "print(f\"Model ID: {vm_banking_model.input_id}\")" + ] + }, + { + "cell_type": "markdown", + "id": "72041947", + "metadata": {}, + "source": [ + "## Running tests\n", + "\n", + "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module.\n", + "\n", + "In this section, we'll run validation tests on both our defined system prompt as well as evaluation tests on our agent's performance." 
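,
    "\n",
    "Before running the tests below, you can optionally sanity-check the wrapper function directly. This is a minimal smoke test rather than a ValidMind test: it assumes your `OPENAI_API_KEY` is set, the query is one of the examples from the system prompt, and the session ID is an arbitrary string.\n",
    "\n",
    "```python\n",
    "sample = banking_agent_fn({\"input\": \"Check balance for checking account 12345\", \"session_id\": \"smoke-test-1\"})\n",
    "print(sample[\"prediction\"])\n",
    "print(sample[\"tool_called\"])\n",
    "```"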
+ ]
  },
  {
   "cell_type": "markdown",
   "id": "af84f571",
   "metadata": {},
   "source": [
    "### Validate the system prompt\n",
    "\n",
    "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering.\n",
    "\n",
    "Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n",
    "\n",
    "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n",
    "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n",
    "- **[Delimitation](https://docs.validmind.ai/tests/prompt_validation/Delimitation.html)** — When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n",
    "- **[NegativeInstruction](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — Whether the prompt contains negative instructions.\n",
    "- **[Specificity](https://docs.validmind.ai/tests/prompt_validation/Specificity.html)** — How specifically the prompt defines the task."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f52dceb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "vm.tests.run_test(\n",
    "    \"validmind.prompt_validation.Clarity\",\n",
    "    inputs={\n",
    "        \"model\": vm_banking_model,\n",
    "    },\n",
    ").log()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70d52333",
   "metadata": {},
   "outputs": [],
   "source": [
    "vm.tests.run_test(\n",
    "    \"validmind.prompt_validation.Conciseness\",\n",
    "    inputs={\n",
    "        \"model\": vm_banking_model,\n",
    "    },\n",
    ").log()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5aa89976",
   "metadata": {},
   "outputs": [],
   "source": [
    "vm.tests.run_test(\n",
    "    \"validmind.prompt_validation.Delimitation\",\n",
    "    inputs={\n",
    "        \"model\": vm_banking_model,\n",
    "    },\n",
    ").log()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8630197e",
   "metadata": {},
   "outputs": [],
   "source": [
    "vm.tests.run_test(\n",
    "    \"validmind.prompt_validation.NegativeInstruction\",\n",
    "    inputs={\n",
    "        \"model\": vm_banking_model,\n",
    "    },\n",
    ").log()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3bd1038",
   "metadata": {},
   "outputs": [],
   "source": [
    "vm.tests.run_test(\n",
    "    \"validmind.prompt_validation.Specificity\",\n",
    "    inputs={\n",
    "        \"model\": vm_banking_model,\n",
    "    },\n",
    ").log()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99e70a96",
   "metadata": {},
   "source": [
    "### Evaluate the banking agent\n",
    "\n",
    "After validating our system prompt, let's move on to evaluating the agent we built."
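,
    "\n",
    "Conceptually, the evaluation works row by row over a test dataset: each row pairs a banking query with reference outputs, and the agent's prediction and tool calls are compared against them. The sketch below shows what the ValidMind tests automate; it uses the sample dataset imported in the next section, and the session ID is an arbitrary value required by the wrapper:\n",
    "\n",
    "```python\n",
    "# Sketch of one evaluation pass (illustrative; the ValidMind tests handle this for you)\n",
    "for _, row in banking_test_dataset.sample(2).iterrows():\n",
    "    out = banking_agent_fn({\"input\": row[\"input\"], \"session_id\": \"eval-1\"})\n",
    "    # compare out[\"prediction\"] against row[\"possible_outputs\"], and inspect out[\"tool_called\"]\n",
    "```"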
+ ] + }, + { + "cell_type": "markdown", + "id": "9035ae24", + "metadata": {}, + "source": [ + "#### Initialize the ValidMind datasets\n", + "\n", + "First, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use to evaluate our agent's performance across different banking scenarios:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0620699", + "metadata": {}, + "outputs": [], + "source": [ + "from banking_test_dataset import banking_test_dataset" + ] + }, + { + "cell_type": "markdown", + "id": "22f93945", + "metadata": {}, + "source": [ + "The next step is to connect your data with a ValidMind `Dataset` object. **This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n", + "\n", + "Initialize a ValidMind dataset object using the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset) from the ValidMind (`vm`) module. For this example, we'll pass in the following arguments:\n", + "\n", + "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n", + "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n", + "- **`text_column`** — The name of the column containing the text input data.\n", + "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b90a7dfd", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset = vm.init_dataset(\n", + " input_id=\"banking_test_dataset\",\n", + " dataset=banking_test_dataset.sample(2),\n", + " text_column=\"input\",\n", + " target_column=\"possible_outputs\",\n", + ")\n", + "\n", + "print(\"Banking Test Dataset Initialized in ValidMind!\")\n", + "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n", + "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n", + "vm_test_dataset._df.head(1)" + ] + }, + { + "cell_type": "markdown", + "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." + ] + }, + { + "cell_type": "markdown", + "id": "next-docs-7773f25a-ef1c-40d1-9b4b-f6cecdf7dc6c", + "metadata": {}, + "source": [ + "### Work with your model documentation\n", + "\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. 
[Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" + ] + }, + { + "cell_type": "markdown", + "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", + "metadata": {}, + "source": [ + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-vm-362dbd51-d699-405b-b817-f95381861976", + "metadata": {}, + "source": [ + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", + "metadata": { + "vscode": { + "languageId": "plaintext" } + }, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-version-61359c61-7218-4a90-9f6f-70794c42d997", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-restart-c45548b7-aee3-4b96-b412-37c781071787", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-99bac407-03cd-4fe3-83c8-0a280d4caa64", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 5 + "language_info": { + "name": "python", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } From 2df63b745ad9cb2cfc30adbcb27b2a2973346b7d Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 16:15:17 -0800 Subject: [PATCH 22/54] Save point --- .../agents/document_agentic_ai.ipynb | 2377 +++++++++-------- 1 file changed, 1203 insertions(+), 1174 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index f1614b6de..2ad70aa10 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1,1178 +1,1207 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "382caa31", - "metadata": {}, - "source": [ - "# Document an agentic AI system\n", - "\n", - "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, prepare your datasets and agent for testing, run out-of-the-box and custom tests and log those test results to the ValidMind Platform.\n", - "\n", - "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", - "\n", - "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", - "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool.\n", - "\n", - "
For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", - "

\n", - "Before you continue, ensure that a valid OPENAI_API_KEY is set in your .env file.
" - ] - }, - { - "cell_type": "markdown", - "id": "about-intro-d367bebc-fcd0-46df-93ba-5f731f4b9ba9", - "metadata": {}, - "source": [ - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." - ] - }, - { - "cell_type": "markdown", - "id": "about-begin-c1d3d399-73ca-4a88-9ca8-16603eab0751", - "metadata": {}, - "source": [ - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." - ] - }, - { - "cell_type": "markdown", - "id": "about-signup-65fed156-8e69-49eb-b04e-ae52c574d8e7", - "metadata": {}, - "source": [ - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
" - ] - }, - { - "cell_type": "markdown", - "id": "about-concepts-f49034ef-bfb8-4643-88dc-5647eb76968a", - "metadata": {}, - "source": [ - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", - "\n", - "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. 
Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." - ] - }, - { - "cell_type": "markdown", - "id": "cell-d35096df-f3f8-4034-87fa-1973bbdd7b49-efbed180-2181-4735-83c9-7d12b69b9f4c", - "metadata": {}, - "source": [ - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "id": "install-library-20ef5375-d71d-47b1-b669-ed90cb3f0e90", - "metadata": {}, - "source": [ - "### Install the ValidMind Library\n", - "\n", - "
Recommended Python versions\n", - "

\n", - "Python 3.8 <= x <= 3.11
\n", - "\n", - "Let's begin by installing the ValidMind Library with large language model (LLM) support:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-python-94668f4a-272a-4d93-b7c2-da735557fd37", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" - ] - }, - { - "cell_type": "markdown", - "id": "install-initialize-13c50d00-6167-4147-a886-541d73129898", - "metadata": {}, - "source": [ - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "id": "install-register-ab56529b-d2f2-4942-b18e-603fb7342e5b", - "metadata": {}, - "source": [ - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook.\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "id": "install-template-6e8d6279-0edf-442a-bb63-50f7f89e60b3", - "metadata": {}, - "source": [ - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Agentic AI`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "id": "e4a16ffa", - "metadata": {}, - "source": [ - "
Can't select this template?\n", - "

\n", - "Your organization administrators may need to add it to your template library:\n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "install-snippet-468eb7c3-612b-4f14-96aa-dc5a3088448c", - "metadata": {}, - "source": [ - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-init-9b9b870b-01e8-428d-afbf-8e3d9771b3f4", - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", - "metadata": {}, - "source": [ - "#### Preview the documentation template\n", - "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", - "\n", - "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "install-preview-template-b5149386-4ab1-41cb-9527-880573d91509", - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "id": "ba45feba", - "metadata": {}, - "source": [ - "### Verify OpenAI API access\n", - "\n", - "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9684fde1", - "metadata": {}, - "outputs": [], - "source": [ - "# Load environment variables if using .env file\n", - "try:\n", - " from dotenv import load_dotenv\n", - " load_dotenv()\n", - "except ImportError:\n", - " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" - ] - }, - { - "cell_type": "markdown", - "id": "679111bb", - "metadata": {}, - "source": [ - "### Initialize the Python environment\n", - "\n", - "Let's import all the necessary libraries to prepare for building our banking LangGraph agentic system:\n", - "\n", - "- **Standard libraries** for data handling and environment management.\n", - "- **pandas**, a Python library for data manipulation and analytics, as an alias. 
We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", - "- **LangChain** components for LLM integration and tool management.\n", - "- **LangGraph** for building stateful, multi-step agent workflows.\n", - "- **Banking tools** for specialized financial services as defined in [banking_tools.py](banking_tools.py)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a64a021", - "metadata": {}, - "outputs": [], - "source": [ - "# STANDARD LIBRARY IMPORTS\n", - "\n", - "# TypedDict: Defines type-safe dictionaries for the agent's state structure\n", - "# Annotated: Adds metadata to type hints\n", - "# Sequence: Type hint for sequences used in the agent\n", - "from typing import TypedDict, Annotated, Sequence\n", - "\n", - "# THIRD PARTY IMPORTS\n", - "\n", - "import pandas as pd\n", - "# Configure pandas to show all columns and all rows at full width\n", - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_colwidth', None)\n", - "pd.set_option('display.width', None)\n", - "pd.set_option('display.max_rows', None)\n", - "\n", - "# BaseMessage: Represents a base message in the LangChain message system\n", - "# HumanMessage: Represents a human message in the LangChain message system\n", - "# SystemMessage: Represents a system message in the LangChain message system\n", - "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", - "\n", - "# ChatOpenAI: Represents an OpenAI chat model in the LangChain library\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "# MemorySaver: Represents a checkpoint for saving and restoring agent state\n", - "from langgraph.checkpoint.memory import MemorySaver\n", - "\n", - "# StateGraph: Represents a stateful graph in the LangGraph library\n", - "# END: Represents the end of a graph\n", - "# START: Represents the start of a graph\n", - "from langgraph.graph import StateGraph, END, START\n", - "\n", - "# add_messages: Adds messages to the state\n", - "from langgraph.graph.message import add_messages\n", - "\n", - "# ToolNode: Represents a tool node in the LangGraph library\n", - "from langgraph.prebuilt import ToolNode\n", - "\n", - "# LOCAL IMPORTS FROM banking_tools.py\n", - "\n", - "from banking_tools import AVAILABLE_TOOLS" - ] - }, - { - "cell_type": "markdown", - "id": "cf6ebc6c", - "metadata": {}, - "source": [ - "## Building the LangGraph agent" - ] - }, - { - "cell_type": "markdown", - "id": "bf4fc0d7", - "metadata": {}, - "source": [ - "### Test available banking tools\n", - "\n", - "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", - "\n", - "- **Credit Risk Analyzer** - Loan applications and credit decisions\n", - "- **Customer Account Manager** - Account services and customer support\n", - "- **Fraud Detection System** - Security and fraud prevention" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c862fdd", - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", - "print(\"\\nTool Details:\")\n", - "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", - " print(f\" - {tool.name}\")" - ] - }, - { - "cell_type": "markdown", - "id": "4d6f0e26", - "metadata": {}, - "source": [ - "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc0caff2", - 
"metadata": {}, - "outputs": [], - "source": [ - "# Test 1: Credit Risk Analyzer\n", - "print(\"TEST 1: Credit Risk Analyzer\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Access the underlying function using .func\n", - " credit_result = AVAILABLE_TOOLS[0].func(\n", - " customer_income=75000,\n", - " customer_debt=1200,\n", - " credit_score=720,\n", - " loan_amount=50000,\n", - " loan_type=\"personal\"\n", - " )\n", - " print(credit_result)\n", - " print(\"Credit Risk Analyzer test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6b227db", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Test 2: Customer Account Manager\n", - "print(\"TEST 2: Customer Account Manager\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Test checking balance\n", - " account_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"checking\",\n", - " customer_id=\"12345\",\n", - " action=\"check_balance\"\n", - " )\n", - " print(account_result)\n", - "\n", - " # Test getting account info\n", - " info_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"all\",\n", - " customer_id=\"12345\", \n", - " action=\"get_info\"\n", - " )\n", - " print(info_result)\n", - " print(\"Customer Account Manager test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Customer Account Manager test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8442bf81", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Test 3: Fraud Detection System\n", - "print(\"TEST 3: Fraud Detection System\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " fraud_result = AVAILABLE_TOOLS[2].func(\n", - " transaction_id=\"TX123\",\n", - " customer_id=\"12345\",\n", - " transaction_amount=500.00,\n", - " transaction_type=\"withdrawal\",\n", - " location=\"Miami, FL\",\n", - " device_id=\"DEVICE_001\"\n", - " )\n", - " print(fraud_result)\n", - " print(\"Fraud Detection System test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Fraud Detection System test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "markdown", - "id": "5ed83560", - "metadata": {}, - "source": [ - "### Create LangGraph banking agent\n", - "\n", - "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." 
- ] - }, - { - "cell_type": "markdown", - "id": "6a5beb28", - "metadata": {}, - "source": [ - "#### Define system prompt\n", - "\n", - "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64f46a1c", - "metadata": {}, - "outputs": [], - "source": [ - "# Enhanced banking system prompt with tool selection guidance\n", - "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", - " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", - "\n", - " AVAILABLE BANKING TOOLS:\n", - "\n", - " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", - " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", - " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", - " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", - "\n", - " customer_account_manager - Manage customer accounts and provide banking services\n", - " - Use for: account information, transaction processing, product recommendations, customer service\n", - " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", - " - Parameters: account_type, customer_id, action, amount, account_details\n", - "\n", - " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", - " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", - " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", - " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", - "\n", - " BANKING INSTRUCTIONS:\n", - " - Analyze the user's banking request carefully and identify the primary need\n", - " - If they need credit analysis → use credit_risk_analyzer\n", - " - If they need financial calculations → use financial_calculator\n", - " - If they need account services → use customer_account_manager\n", - " - If they need security analysis → use fraud_detection_system\n", - " - Extract relevant parameters from the user's request\n", - " - Provide helpful, accurate banking responses based on tool outputs\n", - " - Always consider banking regulations, risk management, and best practices\n", - " - Be professional and thorough in your analysis\n", - "\n", - " Choose and use tools wisely to provide the most helpful banking assistance.\n", - " \"\"\"" - ] - }, - { - "cell_type": "markdown", - "id": "3f8da88d", - "metadata": {}, - "source": [ - "#### Initialize the LLM\n", - "\n", - "Let's initialize the LLM that will power our banking agent:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b828d70", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the main LLM for banking responses\n", - "main_llm = ChatOpenAI(\n", - " model=\"gpt-5-mini\",\n", - " reasoning={\n", - " \"effort\": \"low\",\n", - " \"summary\": \"auto\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "866b59cb", - "metadata": {}, - "source": [ - "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate 
based on request input and the system prompt we defined above:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65f85d86", - "metadata": {}, - "outputs": [], - "source": [ - "# Bind all banking tools to the main LLM\n", - "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)" - ] - }, - { - "cell_type": "markdown", - "id": "5f898062", - "metadata": {}, - "source": [ - "#### Define agent state structure\n", - "\n", - "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", - "\n", - "- **messages** — The conversation history between the user and agent\n", - "- **user_input** — The current user request\n", - "- **session_id** — A unique identifier for the conversation session\n", - "- **context** — Additional context that can be passed between nodes\n", - "\n", - "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7feeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# Banking Agent State Definition\n", - "class BankingAgentState(TypedDict):\n", - " messages: Annotated[Sequence[BaseMessage], add_messages]\n", - " user_input: str\n", - " session_id: str\n", - " context: dict" - ] - }, - { - "cell_type": "markdown", - "id": "31b261b2", - "metadata": {}, - "source": [ - "#### Create agent workflow function\n", - "\n", - "We'll build the LangGraph agent workflow with two main components:\n", - "\n", - "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", - "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", - "\n", - "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "142c20b9", - "metadata": {}, - "outputs": [], - "source": [ - "def create_banking_langgraph_agent():\n", - " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", - " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", - " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", - " messages = state[\"messages\"]\n", - " # Add system context to messages\n", - " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", - " # Get LLM response with tool selection\n", - " response = llm_with_tools.invoke(enhanced_messages)\n", - " return {\n", - " **state,\n", - " \"messages\": messages + [response]\n", - " }\n", - " \n", - " def should_continue(state: BankingAgentState) -> str:\n", - " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", - " last_message = state[\"messages\"][-1]\n", - " # Check if the LLM wants to use tools\n", - " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", - " return \"tools\"\n", - " return END\n", - " \n", - " # Create the banking state graph\n", - " workflow = StateGraph(BankingAgentState)\n", - " # Add nodes\n", - " workflow.add_node(\"llm\", llm_node)\n", - " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", - " # Simplified entry point - go directly to LLM\n", - " workflow.add_edge(START, \"llm\")\n", - " # From LLM, decide whether to use tools or end\n", - " workflow.add_conditional_edges(\n", - " \"llm\",\n", - " should_continue,\n", - " {\"tools\": \"tools\", END: END}\n", - " )\n", - " # Tool execution flows back to LLM for final response\n", - " workflow.add_edge(\"tools\", \"llm\")\n", - " # Set up memory\n", - " memory = MemorySaver()\n", - " # Compile the graph\n", - " agent = workflow.compile(checkpointer=memory)\n", - " return agent" - ] - }, - { - "cell_type": "markdown", - "id": "0f19d4b7", - "metadata": {}, - "source": [ - "#### Instantiate the banking agent\n", - "\n", - "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", - "\n", - "This compiled agent is ready to process banking requests and will automatically select and use the appropriate tools based on user queries:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aabb7842", - "metadata": {}, - "outputs": [], - "source": [ - "# Create the banking intelligent agent\n", - "banking_agent = create_banking_langgraph_agent()\n", - "\n", - "print(\"Banking LangGraph Agent Created Successfully!\")\n", - "print(\"\\nFeatures:\")\n", - "print(\" - Intelligent banking tool selection\")\n", - "print(\" - Comprehensive banking system prompt\")\n", - "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", - "print(\" - Automatic tool parameter extraction\")\n", - "print(\" - Professional banking assistance\")" - ] - }, - { - "cell_type": "markdown", - "id": "cfd302bb", - "metadata": {}, - "source": [ - "### Integrate agent with ValidMind\n", - "\n", - "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
- ] - }, - { - "cell_type": "markdown", - "id": "e2540236", - "metadata": {}, - "source": [ - "#### Import ValidMind components\n", - "\n", - "We'll start with importing the necessary ValidMind components for integrating our agent:\n", - "\n", - "- `Prompt` from `validmind.models` for handling prompt-based model inputs\n", - "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` from `validmind.scorers.llm.deepeval` for extracting and converting tool calls from agent outputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67557905", - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.models import Prompt\n", - "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" - ] - }, - { - "cell_type": "markdown", - "id": "c30dd6b1", - "metadata": {}, - "source": [ - "#### Create agent wrapper function\n", - "\n", - "We'll then create a wrapper function that:\n", - "\n", - "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", - "- Invokes the banking agent with the proper state initialization\n", - "- Captures tool outputs and tool calls for evaluation\n", - "- Returns a standardized response format that includes the prediction, full output, tool messages, and tool call information\n", - "- Handles errors gracefully with fallback responses" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db1fcc20", - "metadata": {}, - "outputs": [], - "source": [ - "def banking_agent_fn(input):\n", - " \"\"\"\n", - " Invoke the banking agent with the given input.\n", - " \"\"\"\n", - " try:\n", - " # Initial state for banking agent\n", - " initial_state = {\n", - " \"user_input\": input[\"input\"],\n", - " \"messages\": [HumanMessage(content=input[\"input\"])],\n", - " \"session_id\": input[\"session_id\"],\n", - " \"context\": {}\n", - " }\n", - " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", - " result = banking_agent.invoke(initial_state, config=session_config)\n", - "\n", - " from utils import capture_tool_output_messages\n", - "\n", - " # Capture all tool outputs and metadata\n", - " captured_data = capture_tool_output_messages(result)\n", - " \n", - " # Access specific tool outputs, this will be used for RAGAS tests\n", - " tool_message = \"\"\n", - " for output in captured_data[\"tool_outputs\"]:\n", - " tool_message += output['content']\n", - " \n", - " tool_calls_found = []\n", - " messages = result['messages']\n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - "\n", - "\n", - " return {\n", - " \"prediction\": result['messages'][-1].content[0]['text'],\n", - " \"output\": result,\n", - " \"tool_messages\": [tool_message],\n", - " # \"tool_calls\": tool_calls_found,\n", - " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", - " }\n", - " except Exception as e:\n", - " # Return a fallback response if the agent fails\n", - " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", - " Please try rephrasing your question or contact support if the issue 
persists.\"\"\"\n", - " return {\n", - " \"prediction\": error_message, \n", - " \"output\": {\n", - " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", - " \"error\": str(e)\n", - " }\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "4ea44f1e", - "metadata": {}, - "source": [ - "#### Initialize the ValidMind model object\n", - "\n", - "\n", - "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", - "\n", - "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model) that:\n", - "\n", - "- Associates the wrapper function with the model for prediction\n", - "- Stores the system prompt template for documentation\n", - "- Provides a unique `input_id` for tracking and identification\n", - "- Enables the agent to be used with ValidMind's testing and documentation features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4389e36", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the agent as a model\n", - "vm_banking_model = vm.init_model(\n", - " input_id=\"banking_agent_model\",\n", - " predict_fn=banking_agent_fn,\n", - " prompt=Prompt(template=system_context)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cd6eb68b", - "metadata": {}, - "source": [ - "#### Store the agent reference\n", - "\n", - "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e39d400", - "metadata": {}, - "outputs": [], - "source": [ - "# Add the banking agent to the vm model\n", - "vm_banking_model.model = banking_agent" - ] - }, - { - "cell_type": "markdown", - "id": "2db4b849", - "metadata": {}, - "source": [ - "#### Verify integration\n", - "\n", - "Let's confirm that the banking agent has been successfully integrated with ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59afbb6d", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", - "print(f\"Model ID: {vm_banking_model.input_id}\")" - ] - }, - { - "cell_type": "markdown", - "id": "72041947", - "metadata": {}, - "source": [ - "## Running tests\n", - "\n", - "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module.\n", - "\n", - "In this section, we'll run validation tests on both our defined system prompt as well as evaluation tests on our agent's performance." 
- ] - }, - { - "cell_type": "markdown", - "id": "af84f571", - "metadata": {}, - "source": [ - "### Validate the system prompt\n", - "\n", - "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering.\n", - "\n", - "Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n", - "\n", - "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n", - "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n", - "- **[Delimitation](https://docs.validmind.ai/tests/prompt_validation/Delimitation.html)** — When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **[NegativeInstruction](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — Whether the prompt contains negative instructions.\n", - "- **[Specificity](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — How specific the prompt defines the task." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f52dceb1", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Clarity\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70d52333", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Conciseness\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5aa89976", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Delimitation\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8630197e", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.NegativeInstruction\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3bd1038", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Specificity\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "99e70a96", - "metadata": {}, - "source": [ - "### Evaluate the banking agent\n", - "\n", - "After validating our system prompt, let's move on to evaluating the agent we built." 
- ] - }, - { - "cell_type": "markdown", - "id": "9035ae24", - "metadata": {}, - "source": [ - "#### Initialize the ValidMind datasets\n", - "\n", - "First, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use to evaluate our agent's performance across different banking scenarios:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0620699", - "metadata": {}, - "outputs": [], - "source": [ - "from banking_test_dataset import banking_test_dataset" - ] - }, - { - "cell_type": "markdown", - "id": "22f93945", - "metadata": {}, - "source": [ - "The next step is to connect your data with a ValidMind `Dataset` object. **This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n", - "\n", - "Initialize a ValidMind dataset object using the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset) from the ValidMind (`vm`) module. For this example, we'll pass in the following arguments:\n", - "\n", - "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n", - "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n", - "- **`text_column`** — The name of the column containing the text input data.\n", - "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b90a7dfd", - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset = vm.init_dataset(\n", - " input_id=\"banking_test_dataset\",\n", - " dataset=banking_test_dataset.sample(2),\n", - " text_column=\"input\",\n", - " target_column=\"possible_outputs\",\n", - ")\n", - "\n", - "print(\"Banking Test Dataset Initialized in ValidMind!\")\n", - "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n", - "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n", - "vm_test_dataset._df.head(1)" - ] - }, - { - "cell_type": "markdown", - "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", - "metadata": {}, - "source": [ - "## Next steps\n", - "\n", - "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." - ] - }, - { - "cell_type": "markdown", - "id": "next-docs-7773f25a-ef1c-40d1-9b4b-f6cecdf7dc6c", - "metadata": {}, - "source": [ - "### Work with your model documentation\n", - "\n", - "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", - "\n", - "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", - "\n", - "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. 
[Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" - ] - }, - { - "cell_type": "markdown", - "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", - "metadata": {}, - "source": [ - "### Discover more learning resources\n", - "\n", - "We offer many interactive notebooks to help you document models:\n", - "\n", - "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", - "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", - "\n", - "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." - ] - }, - { - "cell_type": "markdown", - "id": "upgrade-vm-362dbd51-d699-405b-b817-f95381861976", - "metadata": {}, - "source": [ - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", - "metadata": { - "vscode": { - "languageId": "plaintext" + "cells": [ + { + "cell_type": "markdown", + "id": "382caa31", + "metadata": {}, + "source": [ + "# Document an agentic AI system\n", + "\n", + "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, prepare your datasets and agent for testing, run out-of-the-box and custom tests and log those test results to the ValidMind Platform.\n", + "\n", + "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", + "\n", + "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", + "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool.\n", + "\n", + "
For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", + "

\n", + "Before you continue, ensure that a valid OPENAI_API_KEY is set in your .env file.
" + ] + }, + { + "cell_type": "markdown", + "id": "about-intro-d367bebc-fcd0-46df-93ba-5f731f4b9ba9", + "metadata": {}, + "source": [ + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." + ] + }, + { + "cell_type": "markdown", + "id": "about-begin-c1d3d399-73ca-4a88-9ca8-16603eab0751", + "metadata": {}, + "source": [ + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "about-signup-65fed156-8e69-49eb-b04e-ae52c574d8e7", + "metadata": {}, + "source": [ + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "id": "about-concepts-f49034ef-bfb8-4643-88dc-5647eb76968a", + "metadata": {}, + "source": [ + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", + "\n", + "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. 
Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." + ] + }, + { + "cell_type": "markdown", + "id": "cell-d35096df-f3f8-4034-87fa-1973bbdd7b49-efbed180-2181-4735-83c9-7d12b69b9f4c", + "metadata": {}, + "source": [ + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "install-library-20ef5375-d71d-47b1-b669-ed90cb3f0e90", + "metadata": {}, + "source": [ + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "Let's begin by installing the ValidMind Library with large language model (LLM) support:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-python-94668f4a-272a-4d93-b7c2-da735557fd37", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" + ] + }, + { + "cell_type": "markdown", + "id": "install-initialize-13c50d00-6167-4147-a886-541d73129898", + "metadata": {}, + "source": [ + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "id": "install-register-ab56529b-d2f2-4942-b18e-603fb7342e5b", + "metadata": {}, + "source": [ + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook.\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "id": "install-template-6e8d6279-0edf-442a-bb63-50f7f89e60b3", + "metadata": {}, + "source": [ + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Agentic AI`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "id": "e4a16ffa", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "install-snippet-468eb7c3-612b-4f14-96aa-dc5a3088448c", + "metadata": {}, + "source": [ + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-init-9b9b870b-01e8-428d-afbf-8e3d9771b3f4", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", + "metadata": {}, + "source": [ + "#### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-preview-template-b5149386-4ab1-41cb-9527-880573d91509", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "ba45feba", + "metadata": {}, + "source": [ + "### Verify OpenAI API access\n", + "\n", + "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9684fde1", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables if using .env file\n", + "try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv()\n", + "except ImportError:\n", + " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" + ] + }, + { + "cell_type": "markdown", + "id": "679111bb", + "metadata": {}, + "source": [ + "### Initialize the Python environment\n", + "\n", + "Let's import all the necessary libraries to prepare for building our banking LangGraph agentic system:\n", + "\n", + "- **Standard libraries** for data handling and environment management.\n", + "- **pandas**, a Python library for data manipulation and analytics, as an alias. 
We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", + "- **LangChain** components for LLM integration and tool management.\n", + "- **LangGraph** for building stateful, multi-step agent workflows.\n", + "- **Banking tools** for specialized financial services as defined in [banking_tools.py](banking_tools.py)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a64a021", + "metadata": {}, + "outputs": [], + "source": [ + "# STANDARD LIBRARY IMPORTS\n", + "\n", + "# TypedDict: Defines type-safe dictionaries for the agent's state structure\n", + "# Annotated: Adds metadata to type hints\n", + "# Sequence: Type hint for sequences used in the agent\n", + "from typing import TypedDict, Annotated, Sequence\n", + "\n", + "# THIRD PARTY IMPORTS\n", + "\n", + "import pandas as pd\n", + "# Configure pandas to show all columns and all rows at full width\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)\n", + "pd.set_option('display.max_rows', None)\n", + "\n", + "# BaseMessage: Represents a base message in the LangChain message system\n", + "# HumanMessage: Represents a human message in the LangChain message system\n", + "# SystemMessage: Represents a system message in the LangChain message system\n", + "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", + "\n", + "# ChatOpenAI: Represents an OpenAI chat model in the LangChain library\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "# MemorySaver: Represents a checkpoint for saving and restoring agent state\n", + "from langgraph.checkpoint.memory import MemorySaver\n", + "\n", + "# StateGraph: Represents a stateful graph in the LangGraph library\n", + "# END: Represents the end of a graph\n", + "# START: Represents the start of a graph\n", + "from langgraph.graph import StateGraph, END, START\n", + "\n", + "# add_messages: Adds messages to the state\n", + "from langgraph.graph.message import add_messages\n", + "\n", + "# ToolNode: Represents a tool node in the LangGraph library\n", + "from langgraph.prebuilt import ToolNode\n", + "\n", + "# LOCAL IMPORTS FROM banking_tools.py\n", + "\n", + "from banking_tools import AVAILABLE_TOOLS" + ] + }, + { + "cell_type": "markdown", + "id": "cf6ebc6c", + "metadata": {}, + "source": [ + "## Building the LangGraph agent" + ] + }, + { + "cell_type": "markdown", + "id": "bf4fc0d7", + "metadata": {}, + "source": [ + "### Test available banking tools\n", + "\n", + "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", + "\n", + "- **Credit Risk Analyzer** - Loan applications and credit decisions\n", + "- **Customer Account Manager** - Account services and customer support\n", + "- **Fraud Detection System** - Security and fraud prevention" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c862fdd", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", + "print(\"\\nTool Details:\")\n", + "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", + " print(f\" - {tool.name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4d6f0e26", + "metadata": {}, + "source": [ + "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc0caff2", + 
"metadata": {}, + "outputs": [], + "source": [ + "# Test 1: Credit Risk Analyzer\n", + "print(\"TEST 1: Credit Risk Analyzer\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " # Access the underlying function using .func\n", + " credit_result = AVAILABLE_TOOLS[0].func(\n", + " customer_income=75000,\n", + " customer_debt=1200,\n", + " credit_score=720,\n", + " loan_amount=50000,\n", + " loan_type=\"personal\"\n", + " )\n", + " print(credit_result)\n", + " print(\"Credit Risk Analyzer test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6b227db", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Test 2: Customer Account Manager\n", + "print(\"TEST 2: Customer Account Manager\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " # Test checking balance\n", + " account_result = AVAILABLE_TOOLS[1].func(\n", + " account_type=\"checking\",\n", + " customer_id=\"12345\",\n", + " action=\"check_balance\"\n", + " )\n", + " print(account_result)\n", + "\n", + " # Test getting account info\n", + " info_result = AVAILABLE_TOOLS[1].func(\n", + " account_type=\"all\",\n", + " customer_id=\"12345\", \n", + " action=\"get_info\"\n", + " )\n", + " print(info_result)\n", + " print(\"Customer Account Manager test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Customer Account Manager test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8442bf81", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Test 3: Fraud Detection System\n", + "print(\"TEST 3: Fraud Detection System\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " fraud_result = AVAILABLE_TOOLS[2].func(\n", + " transaction_id=\"TX123\",\n", + " customer_id=\"12345\",\n", + " transaction_amount=500.00,\n", + " transaction_type=\"withdrawal\",\n", + " location=\"Miami, FL\",\n", + " device_id=\"DEVICE_001\"\n", + " )\n", + " print(fraud_result)\n", + " print(\"Fraud Detection System test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Fraud Detection System test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "5ed83560", + "metadata": {}, + "source": [ + "### Create LangGraph banking agent\n", + "\n", + "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6a5beb28", + "metadata": {}, + "source": [ + "#### Define system prompt\n", + "\n", + "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64f46a1c", + "metadata": {}, + "outputs": [], + "source": [ + "# Enhanced banking system prompt with tool selection guidance\n", + "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", + " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", + "\n", + " AVAILABLE BANKING TOOLS:\n", + "\n", + " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", + " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", + " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", + " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", + "\n", + " customer_account_manager - Manage customer accounts and provide banking services\n", + " - Use for: account information, transaction processing, product recommendations, customer service\n", + " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", + " - Parameters: account_type, customer_id, action, amount, account_details\n", + "\n", + " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", + " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", + " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", + " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", + "\n", + " BANKING INSTRUCTIONS:\n", + " - Analyze the user's banking request carefully and identify the primary need\n", + " - If they need credit analysis → use credit_risk_analyzer\n", + " - If they need financial calculations → use financial_calculator\n", + " - If they need account services → use customer_account_manager\n", + " - If they need security analysis → use fraud_detection_system\n", + " - Extract relevant parameters from the user's request\n", + " - Provide helpful, accurate banking responses based on tool outputs\n", + " - Always consider banking regulations, risk management, and best practices\n", + " - Be professional and thorough in your analysis\n", + "\n", + " Choose and use tools wisely to provide the most helpful banking assistance.\n", + " \"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "3f8da88d", + "metadata": {}, + "source": [ + "#### Initialize the LLM\n", + "\n", + "Let's initialize the LLM that will power our banking agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b828d70", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the main LLM for banking responses\n", + "main_llm = ChatOpenAI(\n", + " model=\"gpt-5-mini\",\n", + " reasoning={\n", + " \"effort\": \"low\",\n", + " \"summary\": \"auto\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "866b59cb", + "metadata": {}, + "source": [ + "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate 
based on request input and the system prompt we defined above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65f85d86", + "metadata": {}, + "outputs": [], + "source": [ + "# Bind all banking tools to the main LLM\n", + "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)" + ] + }, + { + "cell_type": "markdown", + "id": "5f898062", + "metadata": {}, + "source": [ + "#### Define agent state structure\n", + "\n", + "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", + "\n", + "- **messages** — The conversation history between the user and agent\n", + "- **user_input** — The current user request\n", + "- **session_id** — A unique identifier for the conversation session\n", + "- **context** — Additional context that can be passed between nodes\n", + "\n", + "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7feeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# Banking Agent State Definition\n", + "class BankingAgentState(TypedDict):\n", + " messages: Annotated[Sequence[BaseMessage], add_messages]\n", + " user_input: str\n", + " session_id: str\n", + " context: dict" + ] + }, + { + "cell_type": "markdown", + "id": "31b261b2", + "metadata": {}, + "source": [ + "#### Create agent workflow function\n", + "\n", + "We'll build the LangGraph agent workflow with two main components:\n", + "\n", + "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", + "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", + "\n", + "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response." 
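+    ,"\n",
+    "\n",
+    "Once the graph is compiled in the cells below, you can optionally render its wiring to confirm it matches this description. This is a minimal sketch that assumes the compiled agent is available as `banking_agent` and that your installed `langgraph` version exposes the Mermaid export on `get_graph()`:\n",
+    "\n",
+    "```python\n",
+    "# Optional: print a Mermaid diagram of the compiled graph's nodes and edges\n",
+    "print(banking_agent.get_graph().draw_mermaid())\n",
+    "```"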
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142c20b9", + "metadata": {}, + "outputs": [], + "source": [ + "def create_banking_langgraph_agent():\n", + " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", + " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", + " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", + " messages = state[\"messages\"]\n", + " # Add system context to messages\n", + " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", + " # Get LLM response with tool selection\n", + " response = llm_with_tools.invoke(enhanced_messages)\n", + " return {\n", + " **state,\n", + " \"messages\": messages + [response]\n", + " }\n", + " \n", + " def should_continue(state: BankingAgentState) -> str:\n", + " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", + " last_message = state[\"messages\"][-1]\n", + " # Check if the LLM wants to use tools\n", + " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", + " return \"tools\"\n", + " return END\n", + " \n", + " # Create the banking state graph\n", + " workflow = StateGraph(BankingAgentState)\n", + " # Add nodes\n", + " workflow.add_node(\"llm\", llm_node)\n", + " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", + " # Simplified entry point - go directly to LLM\n", + " workflow.add_edge(START, \"llm\")\n", + " # From LLM, decide whether to use tools or end\n", + " workflow.add_conditional_edges(\n", + " \"llm\",\n", + " should_continue,\n", + " {\"tools\": \"tools\", END: END}\n", + " )\n", + " # Tool execution flows back to LLM for final response\n", + " workflow.add_edge(\"tools\", \"llm\")\n", + " # Set up memory\n", + " memory = MemorySaver()\n", + " # Compile the graph\n", + " agent = workflow.compile(checkpointer=memory)\n", + " return agent" + ] + }, + { + "cell_type": "markdown", + "id": "0f19d4b7", + "metadata": {}, + "source": [ + "#### Instantiate the banking agent\n", + "\n", + "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", + "\n", + "This compiled agent is ready to process banking requests and will automatically select and use the appropriate tools based on user queries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aabb7842", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the banking intelligent agent\n", + "banking_agent = create_banking_langgraph_agent()\n", + "\n", + "print(\"Banking LangGraph Agent Created Successfully!\")\n", + "print(\"\\nFeatures:\")\n", + "print(\" - Intelligent banking tool selection\")\n", + "print(\" - Comprehensive banking system prompt\")\n", + "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", + "print(\" - Automatic tool parameter extraction\")\n", + "print(\" - Professional banking assistance\")" + ] + }, + { + "cell_type": "markdown", + "id": "cfd302bb", + "metadata": {}, + "source": [ + "### Integrate agent with ValidMind\n", + "\n", + "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
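+    ,"\n",
+    "\n",
+    "Before wrapping the agent, you can optionally confirm that it runs end to end with a direct invocation. This minimal sketch mirrors the state and config that the wrapper function below constructs; the exact structure of the returned message content may vary by model and library version:\n",
+    "\n",
+    "```python\n",
+    "# Optional smoke test: invoke the compiled agent with a sample banking request\n",
+    "sample_state = {\n",
+    "    \"user_input\": \"Check balance for checking account 12345\",\n",
+    "    \"messages\": [HumanMessage(content=\"Check balance for checking account 12345\")],\n",
+    "    \"session_id\": \"smoke-test-session\",\n",
+    "    \"context\": {}\n",
+    "}\n",
+    "sample_config = {\"configurable\": {\"thread_id\": \"smoke-test-session\"}}\n",
+    "sample_result = banking_agent.invoke(sample_state, config=sample_config)\n",
+    "print(sample_result[\"messages\"][-1].content)\n",
+    "```"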
+ ] + }, + { + "cell_type": "markdown", + "id": "e2540236", + "metadata": {}, + "source": [ + "#### Import ValidMind components\n", + "\n", + "We'll start with importing the necessary ValidMind components for integrating our agent:\n", + "\n", + "- `Prompt` from `validmind.models` for handling prompt-based model inputs\n", + "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` from `validmind.scorers.llm.deepeval` for extracting and converting tool calls from agent outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67557905", + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.models import Prompt\n", + "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" + ] + }, + { + "cell_type": "markdown", + "id": "c30dd6b1", + "metadata": {}, + "source": [ + "#### Create agent wrapper function\n", + "\n", + "We'll then create a wrapper function that:\n", + "\n", + "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", + "- Invokes the banking agent with the proper state initialization\n", + "- Captures tool outputs and tool calls for evaluation\n", + "- Returns a standardized response format that includes the prediction, full output, tool messages, and tool call information\n", + "- Handles errors gracefully with fallback responses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db1fcc20", + "metadata": {}, + "outputs": [], + "source": [ + "def banking_agent_fn(input):\n", + " \"\"\"\n", + " Invoke the banking agent with the given input.\n", + " \"\"\"\n", + " try:\n", + " # Initial state for banking agent\n", + " initial_state = {\n", + " \"user_input\": input[\"input\"],\n", + " \"messages\": [HumanMessage(content=input[\"input\"])],\n", + " \"session_id\": input[\"session_id\"],\n", + " \"context\": {}\n", + " }\n", + " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", + " result = banking_agent.invoke(initial_state, config=session_config)\n", + "\n", + " from utils import capture_tool_output_messages\n", + "\n", + " # Capture all tool outputs and metadata\n", + " captured_data = capture_tool_output_messages(result)\n", + " \n", + " # Access specific tool outputs, this will be used for RAGAS tests\n", + " tool_message = \"\"\n", + " for output in captured_data[\"tool_outputs\"]:\n", + " tool_message += output['content']\n", + " \n", + " tool_calls_found = []\n", + " messages = result['messages']\n", + " for message in messages:\n", + " if hasattr(message, 'tool_calls') and message.tool_calls:\n", + " for tool_call in message.tool_calls:\n", + " # Handle both dictionary and object formats\n", + " if isinstance(tool_call, dict):\n", + " tool_calls_found.append(tool_call['name'])\n", + " else:\n", + " # ToolCall object - use attribute access\n", + " tool_calls_found.append(tool_call.name)\n", + "\n", + "\n", + " return {\n", + " \"prediction\": result['messages'][-1].content[0]['text'],\n", + " \"output\": result,\n", + " \"tool_messages\": [tool_message],\n", + " # \"tool_calls\": tool_calls_found,\n", + " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", + " }\n", + " except Exception as e:\n", + " # Return a fallback response if the agent fails\n", + " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", + " Please try rephrasing your question or contact support if the issue 
persists.\"\"\"\n", + " return {\n", + " \"prediction\": error_message, \n", + " \"output\": {\n", + " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", + " \"error\": str(e)\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "4ea44f1e", + "metadata": {}, + "source": [ + "#### Initialize the ValidMind model object\n", + "\n", + "\n", + "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", + "\n", + "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model) that:\n", + "\n", + "- Associates the wrapper function with the model for prediction\n", + "- Stores the system prompt template for documentation\n", + "- Provides a unique `input_id` for tracking and identification\n", + "- Enables the agent to be used with ValidMind's testing and documentation features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4389e36", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the agent as a model\n", + "vm_banking_model = vm.init_model(\n", + " input_id=\"banking_agent_model\",\n", + " predict_fn=banking_agent_fn,\n", + " prompt=Prompt(template=system_context)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cd6eb68b", + "metadata": {}, + "source": [ + "#### Store the agent reference\n", + "\n", + "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e39d400", + "metadata": {}, + "outputs": [], + "source": [ + "# Add the banking agent to the vm model\n", + "vm_banking_model.model = banking_agent" + ] + }, + { + "cell_type": "markdown", + "id": "2db4b849", + "metadata": {}, + "source": [ + "#### Verify integration\n", + "\n", + "Let's confirm that the banking agent has been successfully integrated with ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59afbb6d", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", + "print(f\"Model ID: {vm_banking_model.input_id}\")" + ] + }, + { + "cell_type": "markdown", + "id": "72041947", + "metadata": {}, + "source": [ + "## Running tests\n", + "\n", + "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module.\n", + "\n", + "In this section, we'll run validation tests on both our defined system prompt as well as evaluation tests on our agent's performance." 
+ ] + }, + { + "cell_type": "markdown", + "id": "af84f571", + "metadata": {}, + "source": [ + "### Validate the system prompt\n", + "\n", + "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering.\n", + "\n", + "Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n", + "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n", + "- **[Delimitation](https://docs.validmind.ai/tests/prompt_validation/Delimitation.html)** — When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **[NegativeInstruction](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — Whether the prompt contains negative instructions.\n", + "- **[Specificity](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — How specific the prompt defines the task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f52dceb1", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70d52333", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aa89976", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8630197e", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3bd1038", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "99e70a96", + "metadata": {}, + "source": [ + "### Evaluate the banking agent\n", + "\n", + "After validating our system prompt, let's move on to evaluating the agent we built." 
+ ] + }, + { + "cell_type": "markdown", + "id": "9035ae24", + "metadata": {}, + "source": [ + "#### Initialize the ValidMind datasets\n", + "\n", + "First, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use to evaluate our agent's performance across different banking scenarios:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0620699", + "metadata": {}, + "outputs": [], + "source": [ + "from banking_test_dataset import banking_test_dataset" + ] + }, + { + "cell_type": "markdown", + "id": "22f93945", + "metadata": {}, + "source": [ + "The next step is to connect your data with a ValidMind `Dataset` object. **This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n", + "\n", + "Initialize a ValidMind dataset object using the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset) from the ValidMind (`vm`) module. For this example, we'll pass in the following arguments:\n", + "\n", + "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n", + "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n", + "- **`text_column`** — The name of the column containing the text input data.\n", + "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b90a7dfd", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset = vm.init_dataset(\n", + " input_id=\"banking_test_dataset\",\n", + " dataset=banking_test_dataset.sample(2),\n", + " text_column=\"input\",\n", + " target_column=\"possible_outputs\",\n", + ")\n", + "\n", + "print(\"Banking Test Dataset Initialized in ValidMind!\")\n", + "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n", + "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n", + "vm_test_dataset._df.head(1)" + ] + }, + { + "cell_type": "markdown", + "id": "8b7da187", + "metadata": {}, + "source": [ + "#### Assign predictions\n", + "\n", + "Now that both the model object and the datasets have been registered, we'll assign predictions to capture the banking agent's responses for evaluation:\n", + "\n", + "- The [`assign_predictions()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#assign_predictions) from the `Dataset` object can link existing predictions to any number of models.\n", + "- This method links the model's class prediction values and probabilities to our `vm_train_ds` and `vm_test_ds` datasets.\n", + "\n", + "If no prediction values are passed, the method will compute predictions automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83350c38", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_predictions(vm_banking_model)\n", + "\n", + "print(\"Banking Agent Predictions Generated Successfully!\")\n", + "print(f\"Predictions assigned to {len(vm_test_dataset._df)} test cases\")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. 
But there is a better way — use the ValidMind Platform to work with your model documentation." + ] + }, + { + "cell_type": "markdown", + "id": "next-docs-7773f25a-ef1c-40d1-9b4b-f6cecdf7dc6c", + "metadata": {}, + "source": [ + "### Work with your model documentation\n", + "\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" + ] + }, + { + "cell_type": "markdown", + "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", + "metadata": {}, + "source": [ + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-vm-362dbd51-d699-405b-b817-f95381861976", + "metadata": {}, + "source": [ + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-version-61359c61-7218-4a90-9f6f-70794c42d997", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-restart-c45548b7-aee3-4b96-b412-37c781071787", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-99bac407-03cd-4fe3-83c8-0a280d4caa64", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10" } - }, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "id": "upgrade-version-61359c61-7218-4a90-9f6f-70794c42d997", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "upgrade-restart-c45548b7-aee3-4b96-b412-37c781071787", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-99bac407-03cd-4fe3-83c8-0a280d4caa64", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "name": "python", - "version": "3.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } From f55f09923d326f535989011e18095804f1eb23d0 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Mon, 26 Jan 2026 16:26:55 -0800 Subject: [PATCH 23/54] Save point --- .../agents/document_agentic_ai.ipynb | 40 ++++++------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 2ad70aa10..4583cee3f 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -890,18 +890,6 @@ "print(f\"Model ID: {vm_banking_model.input_id}\")" ] }, - { - "cell_type": "markdown", - "id": "72041947", - "metadata": {}, - "source": [ - "## Running tests\n", - "\n", - "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module.\n", - "\n", - "In this section, we'll run validation tests on both our defined system prompt as well as evaluation tests on our agent's performance." - ] - }, { "cell_type": "markdown", "id": "af84f571", @@ -909,9 +897,9 @@ "source": [ "### Validate the system prompt\n", "\n", - "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering.\n", + "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering by running a few tests — we'll run evaluation tests later on our agent's performance.\n", "\n", - "Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n", + "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module. Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n", "\n", "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n", "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n", @@ -995,24 +983,14 @@ ").log()" ] }, - { - "cell_type": "markdown", - "id": "99e70a96", - "metadata": {}, - "source": [ - "### Evaluate the banking agent\n", - "\n", - "After validating our system prompt, let's move on to evaluating the agent we built." 
- ] - }, { "cell_type": "markdown", "id": "9035ae24", "metadata": {}, "source": [ - "#### Initialize the ValidMind datasets\n", + "## Initialize the ValidMind datasets\n", "\n", - "First, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use to evaluate our agent's performance across different banking scenarios:" + "After validation our system prompt, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use in the next section to evaluate our agent's performance across different banking scenarios:" ] }, { @@ -1065,7 +1043,7 @@ "id": "8b7da187", "metadata": {}, "source": [ - "#### Assign predictions\n", + "### Assign predictions\n", "\n", "Now that both the model object and the datasets have been registered, we'll assign predictions to capture the banking agent's responses for evaluation:\n", "\n", @@ -1089,6 +1067,14 @@ "vm_test_dataset._df.head()" ] }, + { + "cell_type": "markdown", + "id": "f1deb8b6", + "metadata": {}, + "source": [ + "## Running evaluation tests" + ] + }, { "cell_type": "markdown", "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", From 83dbc8acc2004734e349a7f625f2627bd50cf6b7 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:13:21 -0800 Subject: [PATCH 24/54] Applying some of Anil's changes to the edited cells --- .../agents/document_agentic_ai.ipynb | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 4583cee3f..3fc8bd8dd 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -489,12 +489,13 @@ "metadata": {}, "outputs": [], "source": [ + "\n", "# Enhanced banking system prompt with tool selection guidance\n", "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", - "\n", + " \n", " AVAILABLE BANKING TOOLS:\n", - "\n", + " \n", " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", @@ -522,6 +523,9 @@ " - Be professional and thorough in your analysis\n", "\n", " Choose and use tools wisely to provide the most helpful banking assistance.\n", + " Describe the response in user friendly manner with details describing the tool output. 
\n", + " Provide the response in at least 500 words.\n", + " Generate a concise execution plan for the banking request.\n", " \"\"\"" ] }, @@ -1027,7 +1031,7 @@ "source": [ "vm_test_dataset = vm.init_dataset(\n", " input_id=\"banking_test_dataset\",\n", - " dataset=banking_test_dataset.sample(2),\n", + " dataset=banking_test_dataset,\n", " text_column=\"input\",\n", " target_column=\"possible_outputs\",\n", ")\n", @@ -1035,7 +1039,7 @@ "print(\"Banking Test Dataset Initialized in ValidMind!\")\n", "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n", "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n", - "vm_test_dataset._df.head(1)" + "vm_test_dataset._df" ] }, { @@ -1130,11 +1134,7 @@ "cell_type": "code", "execution_count": null, "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "%pip show validmind" @@ -1179,13 +1179,13 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "ValidMind Library", "language": "python", - "name": "python3" + "name": "validmind" }, "language_info": { "name": "python", - "version": "3.10" + "version": "3.10.13" } }, "nbformat": 4, From 4b55be1b242c1ffadc30c318e2d736c6205c9149 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:19:06 -0800 Subject: [PATCH 25/54] Setup rest of headings --- .../agents/document_agentic_ai.ipynb | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 3fc8bd8dd..5f0ea8036 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1079,6 +1079,38 @@ "## Running evaluation tests" ] }, + { + "cell_type": "markdown", + "id": "6eab4c9f", + "metadata": {}, + "source": [ + "### Implement custom tests" + ] + }, + { + "cell_type": "markdown", + "id": "b455da6e", + "metadata": {}, + "source": [ + "### Assign scores" + ] + }, + { + "cell_type": "markdown", + "id": "8d80886e", + "metadata": {}, + "source": [ + "### Run out-of-the box tests" + ] + }, + { + "cell_type": "markdown", + "id": "b62f3cc9", + "metadata": {}, + "source": [ + "## In summary" + ] + }, { "cell_type": "markdown", "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", From 762c44285e058cb5054fe273e32275edadb113d7 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:32:13 -0800 Subject: [PATCH 26/54] Save point --- .../agents/document_agentic_ai.ipynb | 194 +++++++++++++++++- 1 file changed, 193 insertions(+), 1 deletion(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 5f0ea8036..b9f4ee473 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1084,7 +1084,199 @@ "id": "6eab4c9f", "metadata": {}, "source": [ - "### Implement custom tests" + "### Implement custom accuracy tests\n", + "\n", + "Let's implement some custom *inline tests* to assess the accuracy of our banking agent:\n", + "\n", + "- An inline test refers to a test written and executed within the same environment as the code being tested — in this case, right in this Jupyter Notebook — without requiring a separate test file or framework.\n", + "- You'll note that the 
custom test functions are just regular Python functions that can include and require any Python library as you see fit." + ] + }, + { + "cell_type": "markdown", + "id": "fe9c4e8b", + "metadata": {}, + "source": [ + "#### Create banking accuracy test\n", + "\n", + "This custom test evaluates the banking agent's ability to provide accurate responses by:\n", + "\n", + "- Testing against a dataset of predefined banking questions and expected answers.\n", + "- Checking if responses contain expected keywords and banking terminology.\n", + "- Providing detailed test results including pass/fail status.\n", + "- Helping identify any gaps in the agent's banking knowledge or response quality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "335aeedc", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "@vm.test(\"my_custom_tests.banking_accuracy_test\")\n", + "def banking_accuracy_test(model, dataset, list_of_columns):\n", + " \"\"\"\n", + " The Banking Accuracy Test evaluates whether the agent’s responses include \n", + " critical domain-specific keywords and phrases that indicate accurate, compliant,\n", + " and contextually appropriate banking information. This test ensures that the agent\n", + " provides responses containing the expected banking terminology, risk classifications,\n", + " account details, or other domain-relevant information required for regulatory compliance,\n", + " customer safety, and operational accuracy.\n", + " \"\"\"\n", + " df = dataset._df\n", + " \n", + " # Pre-compute responses for all tests\n", + " y_true = dataset.y.tolist()\n", + " y_pred = dataset.y_pred(model).tolist()\n", + "\n", + " # Vectorized test results\n", + " test_results = []\n", + " for response, keywords in zip(y_pred, y_true):\n", + " # Convert keywords to list if not already a list\n", + " if not isinstance(keywords, list):\n", + " keywords = [keywords]\n", + " test_results.append(any(str(keyword).lower() in str(response).lower() for keyword in keywords))\n", + " \n", + " results = pd.DataFrame()\n", + " column_names = [col + \"_details\" for col in list_of_columns]\n", + " results[column_names] = df[list_of_columns]\n", + " results[\"actual\"] = y_pred\n", + " results[\"expected\"] = y_true\n", + " results[\"passed\"] = test_results\n", + " results[\"error\"] = None if test_results else f'Response did not contain any expected keywords: {y_true}'\n", + " \n", + " return results\n", + " \n", + "result = vm.test.run_test(\n", + " \"my_custom_tests.banking_accuracy_test\",\n", + " inputs={\n", + " \"dataset\": vm_test_dataset,\n", + " \"model\": vm_banking_model\n", + " },\n", + " params={\n", + " \"list_of_columns\": [\"input\"]\n", + " }\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ab0a492", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.df.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "edf16008", + "metadata": {}, + "source": [ + "#### Create banking tool call accuracy test\n", + "\n", + "This custom test evaluates how accurately our intelligent banking router selects the correct tools for different banking requests by providing quantitative feedback on the agent's core intelligence - its ability to understand what users need and select the right banking tools to help them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62e28f11", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.BankingToolCallAccuracy\")\n", + "def BankingToolCallAccuracy(dataset, agent_output_column, expected_tools_column):\n", + " \"\"\"\n", + " Evaluates the tool selection accuracy of a LangGraph-powered banking agent.\n", + "\n", + " This test measures whether the agent correctly identifies and invokes the required banking tools\n", + " for each user query scenario.\n", + " For each case, the outputs generated by the agent (including its tool calls) are compared against an\n", + " expected set of tools. The test considers both coverage and exactness: it computes the proportion of\n", + " expected tools correctly called by the agent for each instance.\n", + "\n", + " Parameters:\n", + " dataset (VMDataset): The dataset containing user queries, agent outputs, and ground-truth tool expectations.\n", + " agent_output_column (str): Dataset column name containing agent outputs (should include tool call details in 'messages').\n", + " expected_tools_column (str): Dataset column specifying the true expected tools (as lists).\n", + "\n", + " Returns:\n", + " List[dict]: Per-row dictionaries with details: expected tools, found tools, match count, total expected, and accuracy score.\n", + "\n", + " Purpose:\n", + " Provides diagnostic evidence of the banking agent's core reasoning ability—specifically, its capacity to\n", + " interpret user needs and select the correct banking actions. Useful for diagnosing gaps in tool coverage,\n", + " misclassifications, or breakdowns in agent logic.\n", + "\n", + " Interpretation:\n", + " - An accuracy of 1.0 signals perfect tool selection for that example.\n", + " - Lower scores may indicate partial or complete failures to invoke required tools.\n", + " - Review 'found_tools' vs. 
'expected_tools' to understand the source of discrepancies.\n", + "\n", + " Strengths:\n", + " - Directly tests a core capability of compositional tool-use agents.\n", + " - Framework-agnostic; robust to tool call output format (object or dict).\n", + " - Supports batch validation and result logging for systematic documentation.\n", + "\n", + " Limitations:\n", + " - Does not penalize extra, unnecessary tool calls.\n", + " - Does not assess result quality—only correct invocation.\n", + "\n", + " \"\"\"\n", + " def validate_tool_calls_simple(messages, expected_tools):\n", + " \"\"\"Simple validation of tool calls without RAGAS dependency issues.\"\"\"\n", + " \n", + " tool_calls_found = []\n", + " \n", + " for message in messages:\n", + " if hasattr(message, 'tool_calls') and message.tool_calls:\n", + " for tool_call in message.tool_calls:\n", + " # Handle both dictionary and object formats\n", + " if isinstance(tool_call, dict):\n", + " tool_calls_found.append(tool_call['name'])\n", + " else:\n", + " # ToolCall object - use attribute access\n", + " tool_calls_found.append(tool_call.name)\n", + " \n", + " # Check if expected tools were called\n", + " accuracy = 0.0\n", + " matches = 0\n", + " if expected_tools:\n", + " matches = sum(1 for tool in expected_tools if tool in tool_calls_found)\n", + " accuracy = matches / len(expected_tools)\n", + " \n", + " return {\n", + " 'expected_tools': expected_tools,\n", + " 'found_tools': tool_calls_found,\n", + " 'matches': matches,\n", + " 'total_expected': len(expected_tools) if expected_tools else 0,\n", + " 'accuracy': accuracy,\n", + " }\n", + "\n", + " df = dataset._df\n", + " \n", + " results = []\n", + " for i, row in df.iterrows():\n", + " result = validate_tool_calls_simple(row[agent_output_column]['messages'], row[expected_tools_column])\n", + " results.append(result)\n", + " \n", + " return results\n", + "\n", + "vm.test.run_test(\n", + " \"my_custom_tests.BankingToolCallAccuracy\",\n", + " inputs = {\n", + " \"dataset\": vm_test_dataset,\n", + " },\n", + " params = {\n", + " \"agent_output_column\": \"banking_agent_model_output\",\n", + " \"expected_tools_column\": \"expected_tools\"\n", + " }\n", + ").log()" ] }, { From c8e5dbe8c13e2c9833d240fefa69c9a6deac4f17 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:39:33 -0800 Subject: [PATCH 27/54] Save point --- .../agents/document_agentic_ai.ipynb | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index b9f4ee473..83bd527f9 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1149,7 +1149,7 @@ " \n", " return results\n", " \n", - "result = vm.test.run_test(\n", + "result = vm.tests.run_test(\n", " \"my_custom_tests.banking_accuracy_test\",\n", " inputs={\n", " \"dataset\": vm_test_dataset,\n", @@ -1159,16 +1159,8 @@ " \"list_of_columns\": [\"input\"]\n", " }\n", ")\n", - "result.log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ab0a492", - "metadata": {}, - "outputs": [], - "source": [ + "result.log()\n", + "\n", "vm_test_dataset.df.head(5)" ] }, @@ -1267,7 +1259,7 @@ " \n", " return results\n", "\n", - "vm.test.run_test(\n", + "vm.tests.run_test(\n", " \"my_custom_tests.BankingToolCallAccuracy\",\n", " inputs = {\n", " \"dataset\": vm_test_dataset,\n", @@ -1408,7 +1400,15 
@@ "name": "validmind" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", "version": "3.10.13" } }, From dc48a538ebc025ff84a2e784a98540cc1eb6a341 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:00:25 -0800 Subject: [PATCH 28/54] Save point --- .../agents/document_agentic_ai.ipynb | 38 +++++++++++++++---- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 83bd527f9..1b7b88a7e 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1084,9 +1084,9 @@ "id": "6eab4c9f", "metadata": {}, "source": [ - "### Implement custom accuracy tests\n", + "### Run custom accuracy tests\n", "\n", - "Let's implement some custom *inline tests* to assess the accuracy of our banking agent:\n", + "Using [`@vm.test`](https://docs.validmind.ai/validmind/validmind.html#test), let's implement some reusable custom *inline tests* to assess the accuracy of our banking agent:\n", "\n", "- An inline test refers to a test written and executed within the same environment as the code being tested — in this case, right in this Jupyter Notebook — without requiring a separate test file or framework.\n", "- You'll note that the custom test functions are just regular Python functions that can include and require any Python library as you see fit." @@ -1097,9 +1097,9 @@ "id": "fe9c4e8b", "metadata": {}, "source": [ - "#### Create banking accuracy test\n", + "#### Response accuracy test\n", "\n", - "This custom test evaluates the banking agent's ability to provide accurate responses by:\n", + "We'll create a custom test that evaluates the banking agent's ability to provide accurate responses by:\n", "\n", "- Testing against a dataset of predefined banking questions and expected answers.\n", "- Checking if responses contain expected keywords and banking terminology.\n", @@ -1147,8 +1147,32 @@ " results[\"passed\"] = test_results\n", " results[\"error\"] = None if test_results else f'Response did not contain any expected keywords: {y_true}'\n", " \n", - " return results\n", - " \n", + " return results" + ] + }, + { + "cell_type": "markdown", + "id": "f3ca07d3", + "metadata": {}, + "source": [ + "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the `log()` method.\n", + "\n", + "We'll also display the first five rows of the test dataset to inspect the results here to see how well the banking agent performed. 
The results DataFrame contains:\n", + "\n", + "- **Input details**: The original input columns from the dataset\n", + "- **Actual**: The agent's actual response\n", + "- **Expected**: The expected keywords that should appear in the response\n", + "- **Passed**: A boolean indicating whether the response contained the expected keywords\n", + "- **Error**: Any error messages if the test failed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "567a058a", + "metadata": {}, + "outputs": [], + "source": [ "result = vm.tests.run_test(\n", " \"my_custom_tests.banking_accuracy_test\",\n", " inputs={\n", @@ -1169,7 +1193,7 @@ "id": "edf16008", "metadata": {}, "source": [ - "#### Create banking tool call accuracy test\n", + "#### Tool selection accuracy test\n", "\n", "This custom test evaluates how accurately our intelligent banking router selects the correct tools for different banking requests by providing quantitative feedback on the agent's core intelligence - its ability to understand what users need and select the right banking tools to help them." ] From f50772e904a405a7436b78fbbe660c45d5f93dda Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:10:36 -0800 Subject: [PATCH 29/54] Save point --- .../agents/document_agentic_ai.ipynb | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 1b7b88a7e..b64d4db5a 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1155,15 +1155,7 @@ "id": "f3ca07d3", "metadata": {}, "source": [ - "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the `log()` method.\n", - "\n", - "We'll also display the first five rows of the test dataset to inspect the results here to see how well the banking agent performed. The results DataFrame contains:\n", - "\n", - "- **Input details**: The original input columns from the dataset\n", - "- **Actual**: The agent's actual response\n", - "- **Expected**: The expected keywords that should appear in the response\n", - "- **Passed**: A boolean indicating whether the response contained the expected keywords\n", - "- **Error**: Any error messages if the test failed" + "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the `log()` method:" ] }, { @@ -1183,8 +1175,30 @@ " \"list_of_columns\": [\"input\"]\n", " }\n", ")\n", - "result.log()\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "id": "397b3a0d", + "metadata": {}, + "source": [ + "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. 
The results DataFrame contains:\n", "\n", + "- **Input details**: The original input columns from the dataset\n", + "- **Actual**: The agent's actual response\n", + "- **Expected**: The expected keywords that should appear in the response\n", + "- **Passed**: A boolean indicating whether the response contained the expected keywords\n", + "- **Error**: Any error messages if the test failed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd9a2155", + "metadata": {}, + "outputs": [], + "source": [ "vm_test_dataset.df.head(5)" ] }, From c35683ef01a6ee6d3dd53e081fb33147b6f2f632 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:26:27 -0800 Subject: [PATCH 30/54] =?UTF-8?q?Running=20evaluation=20tests=20=E2=80=94?= =?UTF-8?q?=20Custom=20response=20accuracy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../agents/document_agentic_ai.ipynb | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index b64d4db5a..27d92621d 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1155,7 +1155,7 @@ "id": "f3ca07d3", "metadata": {}, "source": [ - "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the `log()` method:" + "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the [`log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#log):" ] }, { @@ -1183,13 +1183,17 @@ "id": "397b3a0d", "metadata": {}, "source": [ - "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. The results DataFrame contains:\n", + "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. 
Each column in the output serves a specific purpose in evaluating agent performance:\n", "\n", - "- **Input details**: The original input columns from the dataset\n", - "- **Actual**: The agent's actual response\n", - "- **Expected**: The expected keywords that should appear in the response\n", - "- **Passed**: A boolean indicating whether the response contained the expected keywords\n", - "- **Error**: Any error messages if the test failed" + "- **input** — The original user query or request — essential for understanding the context of each test case and tracing which inputs led to specific agent behaviors\n", + "- **expected_tools** — The banking tools that should be invoked for this request — enables validation of correct tool selection, which is critical for agentic AI systems where choosing the right tool is a key success metric\n", + "- **expected_output** — The expected output or keywords that should appear in the response — defines the success criteria for each test case, enabling objective evaluation of whether the agent produced the correct result\n", + "- **session_id** — A unique identifier for each test session — allows tracking and correlation of related test runs, debugging specific sessions, and maintaining audit trails\n", + "- **category**: The classification of the request type (e.g., credit_risk, account_management, fraud_detection) — helps organize test results by domain and identify performance patterns across different banking use cases\n", + "- **banking_agent_model_output** — The complete agent response including all messages and reasoning — allows you to examine the full output to assess response quality, completeness, and correctness beyond just keyword matching\n", + "- **banking_agent_model_tool_messages** — The messages exchanged with the banking tools — critical for understanding how the agent interacted with tools, what parameters were passed, and what tool outputs were received\n", + "- **banking_agent_model_tool_called** — The specific tool that was invoked — enables validation that the agent selected the correct tool for each request, which is fundamental to agentic AI validation\n", + "- **possible_outputs** — Alternative valid outputs or keywords that could appear in the response — provides flexibility in evaluation by accounting for multiple acceptable response formats or variations" ] }, { From d9498a1924cfc01d7cd1be26ef76bcf9287b9a37 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:44:23 -0800 Subject: [PATCH 31/54] Save point --- .../agents/document_agentic_ai.ipynb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 27d92621d..1af5a3920 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1185,15 +1185,15 @@ "source": [ "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. 
Each column in the output serves a specific purpose in evaluating agent performance:\n", "\n", - "- **input** — The original user query or request — essential for understanding the context of each test case and tracing which inputs led to specific agent behaviors\n", - "- **expected_tools** — The banking tools that should be invoked for this request — enables validation of correct tool selection, which is critical for agentic AI systems where choosing the right tool is a key success metric\n", - "- **expected_output** — The expected output or keywords that should appear in the response — defines the success criteria for each test case, enabling objective evaluation of whether the agent produced the correct result\n", - "- **session_id** — A unique identifier for each test session — allows tracking and correlation of related test runs, debugging specific sessions, and maintaining audit trails\n", - "- **category**: The classification of the request type (e.g., credit_risk, account_management, fraud_detection) — helps organize test results by domain and identify performance patterns across different banking use cases\n", - "- **banking_agent_model_output** — The complete agent response including all messages and reasoning — allows you to examine the full output to assess response quality, completeness, and correctness beyond just keyword matching\n", - "- **banking_agent_model_tool_messages** — The messages exchanged with the banking tools — critical for understanding how the agent interacted with tools, what parameters were passed, and what tool outputs were received\n", - "- **banking_agent_model_tool_called** — The specific tool that was invoked — enables validation that the agent selected the correct tool for each request, which is fundamental to agentic AI validation\n", - "- **possible_outputs** — Alternative valid outputs or keywords that could appear in the response — provides flexibility in evaluation by accounting for multiple acceptable response formats or variations" + "- **input** (original user query or request) — Essential for understanding the context of each test case and tracing which inputs led to specific agent behaviors.\n", + "- **expected_tools** (banking tools that should be invoked for this request) — Enables validation of correct tool selection, which is critical for agentic AI systems where choosing the right tool is a key success metric.\n", + "- **expected_output** (expected output or keywords that should appear in the response) — Defines the success criteria for each test case, enabling objective evaluation of whether the agent produced the correct result.\n", + "- **session_id** (unique identifier for each test session) — Allows tracking and correlation of related test runs, debugging specific sessions, and maintaining audit trails.\n", + "- **category** (classification of the request type) — For example, `credit_risk`, `account_management`, `fraud_detection`; helps organize test results by domain and identify performance patterns across different banking use cases.\n", + "- **banking_agent_model_output** (complete agent response including all messages and reasoning) — Allows you to examine the full output to assess response quality, completeness, and correctness beyond just keyword matching.\n", + "- **banking_agent_model_tool_messages** (messages exchanged with the banking tools) — Critical for understanding how the agent interacted with tools, what parameters were passed, and what tool outputs were received.\n", + "- **banking_agent_model_tool_called** 
(specific tool that was invoked) — Enables validation that the agent selected the correct tool for each request, which is fundamental to agentic AI validation.\n", + "- **possible_outputs** (alternative valid outputs or keywords that could appear in the response) — Provides flexibility in evaluation by accounting for multiple acceptable response formats or variations." ] }, { From b02f6313e196329849d97142d7a5ca61a9cc986d Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:05:47 -0800 Subject: [PATCH 32/54] Save point --- .../agents/document_agentic_ai.ipynb | 146 ++++++++++++------ 1 file changed, 100 insertions(+), 46 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 1af5a3920..ffbe52a1d 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1185,15 +1185,17 @@ "source": [ "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. Each column in the output serves a specific purpose in evaluating agent performance:\n", "\n", - "- **input** (original user query or request) — Essential for understanding the context of each test case and tracing which inputs led to specific agent behaviors.\n", - "- **expected_tools** (banking tools that should be invoked for this request) — Enables validation of correct tool selection, which is critical for agentic AI systems where choosing the right tool is a key success metric.\n", - "- **expected_output** (expected output or keywords that should appear in the response) — Defines the success criteria for each test case, enabling objective evaluation of whether the agent produced the correct result.\n", - "- **session_id** (unique identifier for each test session) — Allows tracking and correlation of related test runs, debugging specific sessions, and maintaining audit trails.\n", - "- **category** (classification of the request type) — For example, `credit_risk`, `account_management`, `fraud_detection`; helps organize test results by domain and identify performance patterns across different banking use cases.\n", - "- **banking_agent_model_output** (complete agent response including all messages and reasoning) — Allows you to examine the full output to assess response quality, completeness, and correctness beyond just keyword matching.\n", - "- **banking_agent_model_tool_messages** (messages exchanged with the banking tools) — Critical for understanding how the agent interacted with tools, what parameters were passed, and what tool outputs were received.\n", - "- **banking_agent_model_tool_called** (specific tool that was invoked) — Enables validation that the agent selected the correct tool for each request, which is fundamental to agentic AI validation.\n", - "- **possible_outputs** (alternative valid outputs or keywords that could appear in the response) — Provides flexibility in evaluation by accounting for multiple acceptable response formats or variations." + "| Column header | Description | Importance |\n", + "|--------------|-------------|------------|\n", + "| **`input`** | Original user query or request | Essential for understanding the context of each test case and tracing which inputs led to specific agent behaviors. 
|\n", + "| **`expected_tools`** | Banking tools that should be invoked for this request | Enables validation of correct tool selection, which is critical for agentic AI systems where choosing the right tool is a key success metric. |\n", + "| **`expected_output`** | Expected output or keywords that should appear in the response | Defines the success criteria for each test case, enabling objective evaluation of whether the agent produced the correct result. |\n", + "| **`session_id`** | Unique identifier for each test session | Allows tracking and correlation of related test runs, debugging specific sessions, and maintaining audit trails. |\n", + "| **`category`** | Classification of the request type | Helps organize test results by domain and identify performance patterns across different banking use cases. |\n", + "| **`banking_agent_model_output`** | Complete agent response including all messages and reasoning | Allows you to examine the full output to assess response quality, completeness, and correctness beyond just keyword matching. |\n", + "| **`banking_agent_model_tool_messages`** | Messages exchanged with the banking tools | Critical for understanding how the agent interacted with tools, what parameters were passed, and what tool outputs were received. |\n", + "| **`banking_agent_model_tool_called`** | Specific tool that was invoked | Enables validation that the agent selected the correct tool for each request, which is fundamental to agentic AI validation. |\n", + "| **`possible_outputs`** | Alternative valid outputs or keywords that could appear in the response | Provides flexibility in evaluation by accounting for multiple acceptable response formats or variations. |" ] }, { @@ -1213,13 +1215,72 @@ "source": [ "#### Tool selection accuracy test\n", "\n", - "This custom test evaluates how accurately our intelligent banking router selects the correct tools for different banking requests by providing quantitative feedback on the agent's core intelligence - its ability to understand what users need and select the right banking tools to help them." + "We'll create a custom test that evaluates the banking agent's ability to select the correct tools for different requests by:\n", + "\n", + "- Testing against a dataset of predefined banking queries with expected tool selections.\n", + "- Comparing the tools actually invoked by the agent against the expected tools for each request.\n", + "- Providing quantitative accuracy scores that measure the proportion of expected tools correctly selected.\n", + "- Helping identify gaps in the agent's understanding of user needs and tool selection logic." + ] + }, + { + "cell_type": "markdown", + "id": "62e28f11", + "metadata": {}, + "source": [ + "First, we'll define a helper function that extracts tool calls from the agent's messages and compares them against the expected tools. 
This function handles different message formats (dictionary or object) and calculates accuracy scores:" ] }, { "cell_type": "code", "execution_count": null, - "id": "62e28f11", + "id": "f8f18f1b", + "metadata": {}, + "outputs": [], + "source": [ + "def validate_tool_calls_simple(messages, expected_tools):\n", + " \"\"\"Simple validation of tool calls without RAGAS dependency issues.\"\"\"\n", + " \n", + " tool_calls_found = []\n", + " \n", + " for message in messages:\n", + " if hasattr(message, 'tool_calls') and message.tool_calls:\n", + " for tool_call in message.tool_calls:\n", + " # Handle both dictionary and object formats\n", + " if isinstance(tool_call, dict):\n", + " tool_calls_found.append(tool_call['name'])\n", + " else:\n", + " # ToolCall object - use attribute access\n", + " tool_calls_found.append(tool_call.name)\n", + " \n", + " # Check if expected tools were called\n", + " accuracy = 0.0\n", + " matches = 0\n", + " if expected_tools:\n", + " matches = sum(1 for tool in expected_tools if tool in tool_calls_found)\n", + " accuracy = matches / len(expected_tools)\n", + " \n", + " return {\n", + " 'expected_tools': expected_tools,\n", + " 'found_tools': tool_calls_found,\n", + " 'matches': matches,\n", + " 'total_expected': len(expected_tools) if expected_tools else 0,\n", + " 'accuracy': accuracy,\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "7fb499c4", + "metadata": {}, + "source": [ + "Now we'll define the main test function that uses the helper function to evaluate tool selection accuracy across all test cases in the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d958ab5e", "metadata": {}, "outputs": [], "source": [ @@ -1262,36 +1323,6 @@ " - Does not assess result quality—only correct invocation.\n", "\n", " \"\"\"\n", - " def validate_tool_calls_simple(messages, expected_tools):\n", - " \"\"\"Simple validation of tool calls without RAGAS dependency issues.\"\"\"\n", - " \n", - " tool_calls_found = []\n", - " \n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - " \n", - " # Check if expected tools were called\n", - " accuracy = 0.0\n", - " matches = 0\n", - " if expected_tools:\n", - " matches = sum(1 for tool in expected_tools if tool in tool_calls_found)\n", - " accuracy = matches / len(expected_tools)\n", - " \n", - " return {\n", - " 'expected_tools': expected_tools,\n", - " 'found_tools': tool_calls_found,\n", - " 'matches': matches,\n", - " 'total_expected': len(expected_tools) if expected_tools else 0,\n", - " 'accuracy': accuracy,\n", - " }\n", - "\n", " df = dataset._df\n", " \n", " results = []\n", @@ -1299,18 +1330,41 @@ " result = validate_tool_calls_simple(row[agent_output_column]['messages'], row[expected_tools_column])\n", " results.append(result)\n", " \n", - " return results\n", + " return results" + ] + }, + { + "cell_type": "markdown", + "id": "48836abb", + "metadata": {}, + "source": [ + "Now that we've defined our custom tool selection accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt. 
This test requires:\n", "\n", - "vm.tests.run_test(\n", + "- **`dataset`** — Our test dataset containing banking queries, agent outputs, and expected tool selections.\n", + "- **`agent_output_column`** — The column name in the dataset that contains the agent's full output (including tool call details in the 'messages' field).\n", + "- **`expected_tools_column`** — The column name that specifies which tools should have been invoked for each query.\n", + "\n", + "The test will compare the tools actually called by the agent against the expected tools and compute accuracy scores for each test case. We'll log the test results to the ValidMind Platform with the [`log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#log):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cc532d4", + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.tests.run_test(\n", " \"my_custom_tests.BankingToolCallAccuracy\",\n", - " inputs = {\n", + " inputs={\n", " \"dataset\": vm_test_dataset,\n", " },\n", - " params = {\n", + " params={\n", " \"agent_output_column\": \"banking_agent_model_output\",\n", " \"expected_tools_column\": \"expected_tools\"\n", " }\n", - ").log()" + ")\n", + "result.log()" ] }, { From b188b4446797cf8409f8aa5a329e25724f3ce358 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:21:19 -0800 Subject: [PATCH 33/54] =?UTF-8?q?Running=20evaluation=20tests=20=E2=80=94?= =?UTF-8?q?=20Tool=20selection=20accuracy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../code_samples/agents/document_agentic_ai.ipynb | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index ffbe52a1d..362556d2c 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1215,7 +1215,7 @@ "source": [ "#### Tool selection accuracy test\n", "\n", - "We'll create a custom test that evaluates the banking agent's ability to select the correct tools for different requests by:\n", + "We'll also create a custom test that evaluates the banking agent's ability to select the correct tools for different requests by:\n", "\n", "- Testing against a dataset of predefined banking queries with expected tool selections.\n", "- Comparing the tools actually invoked by the agent against the expected tools for each request.\n", @@ -1338,13 +1338,7 @@ "id": "48836abb", "metadata": {}, "source": [ - "Now that we've defined our custom tool selection accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt. This test requires:\n", - "\n", - "- **`dataset`** — Our test dataset containing banking queries, agent outputs, and expected tool selections.\n", - "- **`agent_output_column`** — The column name in the dataset that contains the agent's full output (including tool call details in the 'messages' field).\n", - "- **`expected_tools_column`** — The column name that specifies which tools should have been invoked for each query.\n", - "\n", - "The test will compare the tools actually called by the agent against the expected tools and compute accuracy scores for each test case. 
We'll log the test results to the ValidMind Platform with the [`log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#log):" + "Finally, we can call our function with `run_test()` and log the test results to the ValidMind Platform:" ] }, { From b9765f5f67b5bfebe1a775b2c7c3fbe736e41396 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:49:49 -0800 Subject: [PATCH 34/54] Save point --- .../agents/document_agentic_ai.ipynb | 143 +++++++++++++++++- 1 file changed, 141 insertions(+), 2 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 362556d2c..d5a784576 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1366,7 +1366,63 @@ "id": "b455da6e", "metadata": {}, "source": [ - "### Assign scores" + "### Assign AI evaluation metric scores\n", + "\n", + "*Scorers* are evaluation metrics that analyze model outputs and store their results in the dataset:\n", + "\n", + "- Each scorer adds a new column to the dataset with format: `{scorer_name}_{metric_name}`\n", + "- The column contains the numeric score (typically `0`-`1`) for each example\n", + "- Multiple scorers can be run on the same dataset, each adding their own column\n", + "- Scores are persisted in the dataset for later analysis and visualization\n", + "- Common scorer patterns include:\n", + " - Model performance metrics (accuracy, F1, etc.)\n", + " - Output quality metrics (relevance, faithfulness)\n", + " - Task-specific metrics (completion, correctness)\n", + "\n", + "*AI agent evaluation metrics* are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the *full execution trace* — including reasoning steps, tool calls, intermediate decisions, and outcomes, rather than just single input–output pairs. These metrics are essential because agent failures often occur in ways traditional LLM metrics miss — for example, choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently.\n", + "\n", + "We'll use the [`assign_scores()` method](https://docs.validmind.ai/validmind/validmind/tests.html#scorer) in this section to evaluate our banking agent's outputs and add scoring metrics to our sample dataset against metrics defined in [DeepEval’s AI agent evaluation framework](https://deepeval.com/guides/guides-ai-agent-evaluation-metrics) which breaks down AI agent evaluation metrics into three layers with corresponding metric categories.\n", + "\n", + "Together, the below three metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." + ] + }, + { + "cell_type": "markdown", + "id": "334a8456", + "metadata": {}, + "source": [ + "#### Reasoning layer\n", + "\n", + "*Reasoning* evaluates planning and strategy generation:\n", + "\n", + "- **PlanQualityMetric** – How logical, complete, and efficient the agent’s plan is.\n", + "- **PlanAdherenceMetric** – Whether the agent follows its own plan during execution." 
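
The `{scorer_name}_{metric_name}` column convention described above is only stated in prose, so a brief sketch of how the persisted score columns might be summarized after `assign_scores()` has run may help. The specific column names (for example `PlanQuality_score`) are assumptions based on that convention; `vm_test_dataset._df` and the `category` column are taken from the dataset columns documented earlier, and none of this is captured notebook output:

```python
# Sketch only: summarize whichever scorer columns exist after assign_scores() runs.
# Columns ending in "_score" follow the {scorer_name}_{metric_name} convention
# described above; "category" is the request-type column documented earlier.
df = vm_test_dataset._df

score_columns = [col for col in df.columns if col.endswith("_score")]

# Mean score per banking category helps spot weak domains at a glance
if score_columns:
    print(df.groupby("category")[score_columns].mean().round(2))
```
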
+ ] + }, + { + "cell_type": "markdown", + "id": "df618f8c", + "metadata": {}, + "source": [ + "#### Action layer\n", + "\n", + "*Action* assesses tool usage and argument generation:\n", + "\n", + "- **ToolCorrectnessMetric** – Whether the agent selects and calls the right tools.\n", + "- **ArgumentCorrectnessMetric** – Whether the agent generates correct tool arguments." + ] + }, + { + "cell_type": "markdown", + "id": "3235d4ec", + "metadata": {}, + "source": [ + "#### Execution layer\n", + "\n", + "*Execution* measures end-to-end performance:\n", + "\n", + "- **TaskCompletionMetric** – Whether the agent successfully completes the intended task.\n", + "- **StepEfficiencyMetric** – Whether the agent avoids unnecessary or redundant steps." ] }, { @@ -1377,12 +1433,95 @@ "### Run out-of-the box tests" ] }, + { + "cell_type": "markdown", + "id": "9dab68d3", + "metadata": {}, + "source": [ + "### RAGAS tests" + ] + }, + { + "cell_type": "markdown", + "id": "6f2b5067", + "metadata": {}, + "source": [ + "#### Faithfulness" + ] + }, + { + "cell_type": "markdown", + "id": "fe65be0a", + "metadata": {}, + "source": [ + "#### Response relevancy" + ] + }, + { + "cell_type": "markdown", + "id": "9d2cb0b1", + "metadata": {}, + "source": [ + "#### Context recall" + ] + }, + { + "cell_type": "markdown", + "id": "4508379e", + "metadata": {}, + "source": [ + "#### Safety" + ] + }, { "cell_type": "markdown", "id": "b62f3cc9", "metadata": {}, "source": [ - "## In summary" + "## In summary**\n", + "\n", + "We have successfully built and tested a comprehensive **Banking AI Agent** using LangGraph and ValidMind. Here's what we've accomplished:\n", + "\n", + "\n", + "\n", + "### What We Built\n", + "\n", + "1. **5 Specialized Banking Tools**\n", + " - Credit Risk Analyzer for loan assessments\n", + " - Customer Account Manager for account services\n", + " - Fraud Detection System for security monitoring\n", + "\n", + "2. **Intelligent LangGraph Agent**\n", + " - Automatic tool selection based on user requests\n", + " - Banking-specific system prompts and guidance\n", + " - Professional banking assistance and responses\n", + "\n", + "3. **Comprehensive Testing Framework**\n", + " - banking-specific test cases\n", + " - ValidMind integration for validation\n", + " - Performance analysis across banking domains\n", + "\n", + "\n", + "\n", + "### Next Steps\n", + "\n", + "1. **Customize Tools**: Adapt the banking tools to your specific banking requirements\n", + "2. **Expand Test Cases**: Add more banking scenarios and edge cases\n", + "3. **Integrate with Real Data**: Connect to actual banking systems and databases\n", + "4. **Add More Tools**: Implement additional banking-specific functionality\n", + "5. **Production Deployment**: Deploy the agent in a production banking environment\n", + "\n", + "\n", + "\n", + "### Key Benefits\n", + "\n", + "- **Industry-Specific**: Designed specifically for banking operations\n", + "- **Regulatory Compliance**: Built-in SR 11-7 and SS 1-23 compliance checks\n", + "- **Risk Management**: Comprehensive credit and fraud risk assessment\n", + "- **Customer Focus**: Tools for both retail and commercial banking needs\n", + "- **Real-World Applicability**: Addresses actual banking use cases and challenges\n", + "\n", + "Your banking AI agent is now ready to handle real-world banking scenarios while maintaining regulatory compliance and risk management best practices!" 
] }, { From de00940a7a113a15d044d69a272c321c899fb35c Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 18:02:22 -0800 Subject: [PATCH 35/54] Save point --- .../agents/document_agentic_ai.ipynb | 208 +++++++++++++++++- 1 file changed, 201 insertions(+), 7 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index d5a784576..b227bacc8 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1395,8 +1395,64 @@ "\n", "*Reasoning* evaluates planning and strategy generation:\n", "\n", - "- **PlanQualityMetric** – How logical, complete, and efficient the agent’s plan is.\n", - "- **PlanAdherenceMetric** – Whether the agent follows its own plan during execution." + "- **Plan quality** – How logical, complete, and efficient the agent’s plan is.\n", + "- **Plan adherence** – Whether the agent follows its own plan during execution." + ] + }, + { + "cell_type": "markdown", + "id": "a35321f1", + "metadata": {}, + "source": [ + "##### Plan quality metric score\n", + "\n", + "Let's measure how well our banking agent generates a plan before acting. A high score means the plan is logical, complete, and efficient." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b37dfa9e", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.PlanQuality\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "25af7e17", + "metadata": {}, + "source": [ + "##### Plan adherence metric score\n", + "\n", + "Let's check whether our banking agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a75445dd", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.PlanAdherence\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " expected_output_column = \"expected_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" ] }, { @@ -1408,8 +1464,65 @@ "\n", "*Action* assesses tool usage and argument generation:\n", "\n", - "- **ToolCorrectnessMetric** – Whether the agent selects and calls the right tools.\n", - "- **ArgumentCorrectnessMetric** – Whether the agent generates correct tool arguments." + "- **Tool correctness** – Whether the agent selects and calls the right tools.\n", + "- **Argument correctness** – Whether the agent generates correct tool arguments." + ] + }, + { + "cell_type": "markdown", + "id": "c49015c1", + "metadata": {}, + "source": [ + "##### Tool correctness metric score\n", + "\n", + "Let's evaluate if our banking agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct." 
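
Before the scoring cell that follows, it can help to eyeball the columns this metric compares; a minimal, purely illustrative sketch using the `banking_agent_model_tool_called` and `expected_tools` columns from the dataset column table shown earlier (the ToolCorrectness score itself is assigned in the next cell):

```python
# Quick visual check of tool selection before assigning the ToolCorrectness score.
# Both columns are described in the dataset column table earlier in this notebook.
vm_test_dataset._df[["input", "banking_agent_model_tool_called", "expected_tools"]].head()
```
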
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33be2dc8", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.ToolCorrectness\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " expected_tools_column = \"expected_tools\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "4a103a62", + "metadata": {}, + "source": [ + "##### Argument correctness metric score\n", + "\n", + "Let's assesses whether our banking agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "860b318e", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.ArgumentCorrectness\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" ] }, { @@ -1417,12 +1530,93 @@ "id": "3235d4ec", "metadata": {}, "source": [ - "#### Execution layer\n", + "#### Execution layer**\n", "\n", "*Execution* measures end-to-end performance:\n", "\n", - "- **TaskCompletionMetric** – Whether the agent successfully completes the intended task.\n", - "- **StepEfficiencyMetric** – Whether the agent avoids unnecessary or redundant steps." + "- **Task completion** – Whether the agent successfully completes the intended task.\n", + "- **Step efficiency** – Whether the agent avoids unnecessary or redundant steps." + ] + }, + { + "cell_type": "markdown", + "id": "9ea98971", + "metadata": {}, + "source": [ + "##### Task completion metric score\n", + "\n", + "The TaskCompletion test evaluates whether our banking agent successfully completes the requested tasks by analyzing its outputs and tool usage. This metric assesses the agent's ability to understand user requests, execute appropriate actions, and provide complete responses that address the original query. The test provides a score between 0-1 along with detailed feedback on task completion quality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48ac405a", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.TaskCompletion\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c33c568a", + "metadata": {}, + "source": [ + "The TaskCompletion scorer has added a new column 'TaskCompletion_score' to our dataset. This is because when we run scorers through assign_scores(), the return values are automatically processed and added as new columns with the format {scorer_name}_{metric_name}. We'll use this column to visualize the distribution of task completion scores across our test cases. Let's visualize the distribution through the box plot test." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f27f3c1", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.plots.BoxPlot\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " params={\n", + " \"columns\": \"TaskCompletion_score\",\n", + " \"title\": \"Distribution of Task Completion Scores\",\n", + " \"ylabel\": \"Score\",\n", + " \"figsize\": (8, 6)\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "19b9c40a", + "metadata": {}, + "source": [ + "##### Step efficiency metric score***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36f8fcf5", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.StepEfficiency\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" ] }, { From 4f2fb514ec5808aa0943e690cca7566c470ba055 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 18:24:43 -0800 Subject: [PATCH 36/54] validmind/scorers/llm/deepeval/StepEfficiency.py --- .../scorers/llm/deepeval/StepEfficiency.py | 110 ++++++++++++++++++ validmind/scorers/llm/deepeval/__init__.py | 2 + 2 files changed, 112 insertions(+) create mode 100644 validmind/scorers/llm/deepeval/StepEfficiency.py diff --git a/validmind/scorers/llm/deepeval/StepEfficiency.py b/validmind/scorers/llm/deepeval/StepEfficiency.py new file mode 100644 index 000000000..32e13469d --- /dev/null +++ b/validmind/scorers/llm/deepeval/StepEfficiency.py @@ -0,0 +1,110 @@ +# Copyright © 2023-2026 ValidMind Inc. All rights reserved. +# Refer to the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +from typing import Any, Dict, List + +from validmind import tags, tasks +from validmind.ai.utils import get_client_and_model +from validmind.errors import MissingDependencyError +from validmind.tests.decorator import scorer +from validmind.vm_models.dataset import VMDataset + +try: + from deepeval import evaluate + from deepeval.metrics import StepEfficiencyMetric + from deepeval.test_case import LLMTestCase, ToolCall +except ImportError as e: + if "deepeval" in str(e): + raise MissingDependencyError( + "Missing required package `deepeval` for StepEfficiency. " + "Please run `pip install validmind[llm]` to use LLM tests", + required_dependencies=["deepeval"], + extra="llm", + ) from e + + raise e + + +@scorer() +@tags("llm", "deepeval", "agent_evaluation", "action_layer", "agentic") +@tasks("llm") +def StepEfficiency( + dataset: VMDataset, + threshold: float = 0.5, + input_column: str = "input", + actual_output_column: str = "actual_output", + agent_output_column: str = "agent_output", + tools_called_column: str = "tools_called", + strict_mode: bool = False, +) -> List[Dict[str, Any]]: + """Evaluates agent step efficiency using deepeval's StepEfficiencyMetric. + + This metric evaluates whether the agent avoids unnecessary or redundant steps + in completing the given task. It analyzes the agent's full execution trace + to assess the efficiency of the execution steps. 
+ + Args: + dataset: Dataset containing the agent input and execution trace + threshold: Minimum passing threshold (default: 0.5) + input_column: Column name for the task input (default: "input") + actual_output_column: Column name for the agent's final output (default: "actual_output") + agent_output_column: Column name for agent output containing trace (default: "agent_output") + tools_called_column: Column name for tools called by the agent (default: "tools_called") + strict_mode: If True, enforces a binary score (0 or 1) + + Returns: + List[Dict[str, Any]] with keys "score" and "reason" for each row. + + Raises: + ValueError: If required columns are missing + """ + # Validate required columns exist in dataset + missing_columns: List[str] = [] + if input_column not in dataset._df.columns: + missing_columns.append(input_column) + + if actual_output_column not in dataset._df.columns: + missing_columns.append(actual_output_column) + + if missing_columns: + raise ValueError( + f"Required columns {missing_columns} not found in dataset. " + f"Available columns: {dataset._df.columns.tolist()}" + ) + + _, model = get_client_and_model() + + metric = StepEfficiencyMetric( + threshold=threshold, + model=model, + include_reason=True, + strict_mode=strict_mode, + verbose_mode=False, + ) + + results: List[Dict[str, Any]] = [] + for _, row in dataset._df.iterrows(): + input_value = row[input_column] + actual_output_value = row[actual_output_column] + tools_called_value = row.get(tools_called_column, []) + if not isinstance(tools_called_value, list) or not all( + isinstance(tool, ToolCall) for tool in tools_called_value + ): + from validmind.scorers.llm.deepeval import _convert_to_tool_call_list + + tools_called_value = _convert_to_tool_call_list(tools_called_value) + test_case = LLMTestCase( + input=input_value, + actual_output=actual_output_value, + tools_called=tools_called_value, + _trace_dict=row.get(agent_output_column, {}), + ) + + result = evaluate(test_cases=[test_case], metrics=[metric]) + metric_data = result.test_results[0].metrics_data[0] + score = metric_data.score + reason = getattr(metric_data, "reason", "No reason provided") + results.append({"score": score, "reason": reason}) + + return results diff --git a/validmind/scorers/llm/deepeval/__init__.py b/validmind/scorers/llm/deepeval/__init__.py index 4a1de3536..bcce1be0d 100644 --- a/validmind/scorers/llm/deepeval/__init__.py +++ b/validmind/scorers/llm/deepeval/__init__.py @@ -13,6 +13,7 @@ from .ArgumentCorrectness import ArgumentCorrectness from .PlanAdherence import PlanAdherence from .PlanQuality import PlanQuality +from .StepEfficiency import StepEfficiency from .ToolCorrectness import ToolCorrectness __all__ = [ @@ -20,6 +21,7 @@ "ArgumentCorrectness", "PlanAdherence", "PlanQuality", + "StepEfficiency", "ToolCorrectness", "_extract_tool_responses", "_extract_tool_calls_from_message", From 0a073cdb4a1977474a8a0d9c8583cff84d8a2d6a Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Tue, 27 Jan 2026 18:36:19 -0800 Subject: [PATCH 37/54] validmind/scorers/llm/deepeval/StepEfficiency.py edit --- .../scorers/llm/deepeval/StepEfficiency.py | 59 +++++++++++++++++-- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/validmind/scorers/llm/deepeval/StepEfficiency.py b/validmind/scorers/llm/deepeval/StepEfficiency.py index 32e13469d..be2e2406c 100644 --- a/validmind/scorers/llm/deepeval/StepEfficiency.py +++ b/validmind/scorers/llm/deepeval/StepEfficiency.py @@ -44,6 +44,11 @@ def 
StepEfficiency( in completing the given task. It analyzes the agent's full execution trace to assess the efficiency of the execution steps. + Note: StepEfficiencyMetric requires a complete execution trace with step-by-step + actions. If the trace structure is incomplete or doesn't contain sufficient + execution steps, the evaluation may fail and return a score of 0.0 with an + explanatory reason. + Args: dataset: Dataset containing the agent input and execution trace threshold: Minimum passing threshold (default: 0.5) @@ -55,6 +60,8 @@ def StepEfficiency( Returns: List[Dict[str, Any]] with keys "score" and "reason" for each row. + If evaluation fails due to incomplete trace structure, returns score 0.0 + with an explanatory reason message. Raises: ValueError: If required columns are missing @@ -94,17 +101,57 @@ def StepEfficiency( from validmind.scorers.llm.deepeval import _convert_to_tool_call_list tools_called_value = _convert_to_tool_call_list(tools_called_value) + + trace_dict = row.get(agent_output_column, {}) + + # StepEfficiencyMetric requires a properly structured trace + # Ensure trace_dict has the necessary structure + if not isinstance(trace_dict, dict): + trace_dict = {} + + # Ensure trace_dict has 'input' and 'output' for task extraction + if "input" not in trace_dict: + trace_dict["input"] = input_value + if "output" not in trace_dict: + trace_dict["output"] = actual_output_value + test_case = LLMTestCase( input=input_value, actual_output=actual_output_value, tools_called=tools_called_value, - _trace_dict=row.get(agent_output_column, {}), + _trace_dict=trace_dict, ) - result = evaluate(test_cases=[test_case], metrics=[metric]) - metric_data = result.test_results[0].metrics_data[0] - score = metric_data.score - reason = getattr(metric_data, "reason", "No reason provided") - results.append({"score": score, "reason": reason}) + try: + result = evaluate(test_cases=[test_case], metrics=[metric]) + metric_data = result.test_results[0].metrics_data[0] + score = metric_data.score + reason = getattr(metric_data, "reason", "No reason provided") + results.append({"score": score, "reason": reason}) + except (UnboundLocalError, AttributeError, KeyError) as e: + # StepEfficiencyMetric may fail if trace structure is incomplete + # This can happen if the trace doesn't contain the required execution steps + error_msg = str(e) + if "prompt" in error_msg or "referenced before assignment" in error_msg: + results.append({ + "score": 0.0, + "reason": ( + f"StepEfficiency evaluation failed: The agent trace may not contain " + f"sufficient execution steps for analysis. StepEfficiencyMetric requires " + f"a complete execution trace with step-by-step actions. " + f"Original error: {error_msg}" + ) + }) + else: + raise + except Exception as e: + # Handle other potential errors gracefully + results.append({ + "score": 0.0, + "reason": ( + f"StepEfficiency evaluation failed: {str(e)}. " + f"This metric requires a properly structured agent execution trace." 
+ ) + }) return results From c106d10ac25668fadd4e781fe1541c86eada0fcc Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 11:13:59 -0800 Subject: [PATCH 38/54] =?UTF-8?q?Running=20evaluation=20tests=20=E2=80=94?= =?UTF-8?q?=20Assign=20AI=20evaluation=20metrics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../code_samples/agents/document_agentic_ai.ipynb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index b227bacc8..31f6e636d 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1545,7 +1545,7 @@ "source": [ "##### Task completion metric score\n", "\n", - "The TaskCompletion test evaluates whether our banking agent successfully completes the requested tasks by analyzing its outputs and tool usage. This metric assesses the agent's ability to understand user requests, execute appropriate actions, and provide complete responses that address the original query. The test provides a score between 0-1 along with detailed feedback on task completion quality." + "Let's evaluate whether our banking agent successfully completes the requested tasks. Incomplete task execution can lead to user dissatisfaction and failed banking operations." ] }, { @@ -1571,7 +1571,9 @@ "id": "c33c568a", "metadata": {}, "source": [ - "The TaskCompletion scorer has added a new column 'TaskCompletion_score' to our dataset. This is because when we run scorers through assign_scores(), the return values are automatically processed and added as new columns with the format {scorer_name}_{metric_name}. We'll use this column to visualize the distribution of task completion scores across our test cases. Let's visualize the distribution through the box plot test." + "As you recall from the beginning of this section, when we run scorers through `assign_scores()`, the return values are automatically processed and added as new columns with the format `{scorer_name}_{metric_name}`. Note that the task completion scorer has added a new column `TaskCompletion_score` to our dataset.\n", + "\n", + "We'll use this column to visualize the distribution of task completion scores across our test cases through the [BoxPlot test](https://docs.validmind.ai/validmind/validmind/tests/plots/BoxPlot.html#boxplot):" ] }, { @@ -1598,7 +1600,9 @@ "id": "19b9c40a", "metadata": {}, "source": [ - "##### Step efficiency metric score***" + "##### Step efficiency metric score\n", + "\n", + "Let's evaluate whether our banking agent avoids unnecessary or redundant steps during task execution. Inefficient step sequences can lead to increased latency, higher costs, and poor user experience." 
] }, { From bdfd87adadb47632df654105af215fcce9b8ed09 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 11:31:05 -0800 Subject: [PATCH 39/54] Save point --- .../agents/document_agentic_ai.ipynb | 67 ++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 31f6e636d..a5cd87fcb 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1386,6 +1386,44 @@ "Together, the below three metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." ] }, + { + "cell_type": "markdown", + "id": "c19a585e", + "metadata": {}, + "source": [ + "#### Discover available scoring methods\n", + "\n", + "Use `list_scorers()` from `validmind.scorers` to discover all available scoring methods that can be used with `assign_scores()`. This function supports filtering by category, task type, or tags to help you find the right scorers for your use case." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "782f0e46", + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.scorers import list_scorers\n", + "\n", + "# List all available scorers\n", + "all_scorers = list_scorers(pretty=False)\n", + "print(f\"Total available scorers: {len(all_scorers)}\\n\")\n", + "\n", + "# List scorers with a formatted table (default)\n", + "scorers_df = list_scorers(pretty=True)\n", + "print(\"Available scorers:\")\n", + "display(scorers_df)\n", + "\n", + "# Filter scorers by category (e.g., LLM/DeepEval scorers)\n", + "llm_scorers = list_scorers(filter=\"deepeval\", pretty=True)\n", + "print(\"\\n\\nDeepEval/LLM agent evaluation scorers:\")\n", + "display(llm_scorers)\n", + "\n", + "# You can also filter by tags or task types\n", + "# Example: list_scorers(tags=[\"classification\"], pretty=True)\n", + "# Example: list_scorers(task=\"classification\", pretty=True)" + ] + }, { "cell_type": "markdown", "id": "334a8456", @@ -1530,7 +1568,7 @@ "id": "3235d4ec", "metadata": {}, "source": [ - "#### Execution layer**\n", + "#### Execution layer\n", "\n", "*Execution* measures end-to-end performance:\n", "\n", @@ -1628,15 +1666,28 @@ "id": "8d80886e", "metadata": {}, "source": [ - "### Run out-of-the box tests" + "### Run out-of-the box RAGAS tests\n", + "\n", + "Finally, we'll run some out-of-the-box *RAGAS* (Retrieval-Augmented Generation Assessment) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", + "\n", + "Our banking agent uses tools to retrieve information and generates responses based on that context, making it similar to a RAG system. 
RAGAS metrics help evaluate the quality of this integration by analyzing the relationship between retrieved tool outputs, user queries, and generated responses.\n", + "\n", + "These tests provide insights into how well our banking agent integrates tool usage with conversational abilities, ensuring it provides accurate, relevant, and helpful responses to banking users while maintaining fidelity to retrieved information." ] }, { "cell_type": "markdown", - "id": "9dab68d3", + "id": "f27423e8", "metadata": {}, "source": [ - "### RAGAS tests" + "#### Identify RAGAS tests*\n", + "\n", + "Let's explore some of ValidMind's available tests. Using ValidMind’s repository of tests streamlines your development testing, and helps you ensure that your models are being documented and evaluated appropriately.\n", + "\n", + "We want to narrow down the tests we want to run from the selection provided by ValidMind, so we'll use the [`vm.tests.list_tasks_and_tags()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tasks_and_tags) to list which `tags` are associated with each `task` type:\n", + "\n", + "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `classification` tasks.\n", + "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `data_quality` tag." ] }, { @@ -1644,7 +1695,7 @@ "id": "6f2b5067", "metadata": {}, "source": [ - "#### Faithfulness" + "##### Faithfulness" ] }, { @@ -1652,7 +1703,7 @@ "id": "fe65be0a", "metadata": {}, "source": [ - "#### Response relevancy" + "##### Response relevancy" ] }, { @@ -1660,7 +1711,7 @@ "id": "9d2cb0b1", "metadata": {}, "source": [ - "#### Context recall" + "##### Context recall" ] }, { @@ -1668,7 +1719,7 @@ "id": "4508379e", "metadata": {}, "source": [ - "#### Safety" + "##### Safety" ] }, { From ebe517eec1730f10f31c9849c55659f8455f4e8b Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 11:42:05 -0800 Subject: [PATCH 40/54] Save point --- .../agents/document_agentic_ai.ipynb | 42 ++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index a5cd87fcb..85e995b69 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1393,7 +1393,7 @@ "source": [ "#### Discover available scoring methods\n", "\n", - "Use `list_scorers()` from `validmind.scorers` to discover all available scoring methods that can be used with `assign_scores()`. This function supports filtering by category, task type, or tags to help you find the right scorers for your use case." + "You can use `list_scorers()` from `validmind.scorers` to discover all available scoring methods that can be used with `assign_scores()`. The function returns a list of scorer IDs. You can filter the results in Python, and use `_load_tests()` and `_pretty_list_tests()` from `validmind.tests.load` to create formatted tables with descriptions." 
] }, { @@ -1404,24 +1404,36 @@ "outputs": [], "source": [ "from validmind.scorers import list_scorers\n", + "from validmind.tests.load import _load_tests, _pretty_list_tests\n", "\n", - "# List all available scorers\n", - "all_scorers = list_scorers(pretty=False)\n", + "# List all available scorers (returns a list of scorer IDs)\n", + "all_scorers = list_scorers()\n", "print(f\"Total available scorers: {len(all_scorers)}\\n\")\n", - "\n", - "# List scorers with a formatted table (default)\n", - "scorers_df = list_scorers(pretty=True)\n", - "print(\"Available scorers:\")\n", + "print(\"First 10 scorers:\")\n", + "for scorer_id in all_scorers[:10]:\n", + " print(f\" - {scorer_id}\")\n", + "\n", + "# To get a formatted table with descriptions, load the scorers and format them\n", + "print(\"\\n\\nAvailable scorers (formatted table):\")\n", + "scorers_dict = _load_tests(all_scorers)\n", + "scorers_df = _pretty_list_tests(scorers_dict, truncate=True)\n", "display(scorers_df)\n", "\n", - "# Filter scorers by category (e.g., LLM/DeepEval scorers)\n", - "llm_scorers = list_scorers(filter=\"deepeval\", pretty=True)\n", - "print(\"\\n\\nDeepEval/LLM agent evaluation scorers:\")\n", - "display(llm_scorers)\n", - "\n", - "# You can also filter by tags or task types\n", - "# Example: list_scorers(tags=[\"classification\"], pretty=True)\n", - "# Example: list_scorers(task=\"classification\", pretty=True)" + "# Filter scorers by category (e.g., LLM/DeepEval scorers) - filter in Python\n", + "llm_scorers = [s for s in all_scorers if \"deepeval\" in s.lower()]\n", + "print(f\"\\n\\nDeepEval/LLM agent evaluation scorers ({len(llm_scorers)} found):\")\n", + "for scorer_id in llm_scorers:\n", + " print(f\" - {scorer_id}\")\n", + "\n", + "# To get a formatted table of filtered scorers\n", + "if llm_scorers:\n", + " llm_scorers_dict = _load_tests(llm_scorers)\n", + " llm_scorers_df = _pretty_list_tests(llm_scorers_dict, truncate=True)\n", + " display(llm_scorers_df)\n", + "\n", + "# You can also filter by other criteria\n", + "# Example: Filter by substring in scorer ID\n", + "# classification_scorers = [s for s in all_scorers if \"classification\" in s.lower()]" ] }, { From 859a2cb23002fadc08e965a838f75eac5694c724 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 11:46:05 -0800 Subject: [PATCH 41/54] Save point --- .../agents/document_agentic_ai.ipynb | 31 +++++-------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 85e995b69..22a6b4128 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1393,7 +1393,9 @@ "source": [ "#### Discover available scoring methods\n", "\n", - "You can use `list_scorers()` from `validmind.scorers` to discover all available scoring methods that can be used with `assign_scores()`. The function returns a list of scorer IDs. You can filter the results in Python, and use `_load_tests()` and `_pretty_list_tests()` from `validmind.tests.load` to create formatted tables with descriptions." 
+ "You can use `list_scorers()` from `validmind.scorers` to discover all available scoring methods and their IDs that can be used with `assign_scores()`.\n", + "\n", + "We'll filter these results to return only DeepEval scorers and use `_load_tests()` and `_pretty_list_tests()` from `validmind.tests.load` to create formatted tables with descriptions:" ] }, { @@ -1406,34 +1408,17 @@ "from validmind.scorers import list_scorers\n", "from validmind.tests.load import _load_tests, _pretty_list_tests\n", "\n", - "# List all available scorers (returns a list of scorer IDs)\n", + "# Discover all available scorers\n", "all_scorers = list_scorers()\n", - "print(f\"Total available scorers: {len(all_scorers)}\\n\")\n", - "print(\"First 10 scorers:\")\n", - "for scorer_id in all_scorers[:10]:\n", - " print(f\" - {scorer_id}\")\n", - "\n", - "# To get a formatted table with descriptions, load the scorers and format them\n", - "print(\"\\n\\nAvailable scorers (formatted table):\")\n", - "scorers_dict = _load_tests(all_scorers)\n", - "scorers_df = _pretty_list_tests(scorers_dict, truncate=True)\n", - "display(scorers_df)\n", - "\n", - "# Filter scorers by category (e.g., LLM/DeepEval scorers) - filter in Python\n", + "\n", + "# Filter to DeepEval scorers only\n", "llm_scorers = [s for s in all_scorers if \"deepeval\" in s.lower()]\n", - "print(f\"\\n\\nDeepEval/LLM agent evaluation scorers ({len(llm_scorers)} found):\")\n", - "for scorer_id in llm_scorers:\n", - " print(f\" - {scorer_id}\")\n", "\n", - "# To get a formatted table of filtered scorers\n", + "# Display formatted table with descriptions\n", "if llm_scorers:\n", " llm_scorers_dict = _load_tests(llm_scorers)\n", " llm_scorers_df = _pretty_list_tests(llm_scorers_dict, truncate=True)\n", - " display(llm_scorers_df)\n", - "\n", - "# You can also filter by other criteria\n", - "# Example: Filter by substring in scorer ID\n", - "# classification_scorers = [s for s in all_scorers if \"classification\" in s.lower()]" + " display(llm_scorers_df)" ] }, { From 20ee19670191f67ab575101fce01b645cd18e398 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 12:10:11 -0800 Subject: [PATCH 42/54] Save point --- .../agents/document_agentic_ai.ipynb | 99 +++++++++++++------ 1 file changed, 69 insertions(+), 30 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 22a6b4128..5b8725281 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1368,22 +1368,11 @@ "source": [ "### Assign AI evaluation metric scores\n", "\n", - "*Scorers* are evaluation metrics that analyze model outputs and store their results in the dataset:\n", - "\n", - "- Each scorer adds a new column to the dataset with format: `{scorer_name}_{metric_name}`\n", - "- The column contains the numeric score (typically `0`-`1`) for each example\n", - "- Multiple scorers can be run on the same dataset, each adding their own column\n", - "- Scores are persisted in the dataset for later analysis and visualization\n", - "- Common scorer patterns include:\n", - " - Model performance metrics (accuracy, F1, etc.)\n", - " - Output quality metrics (relevance, faithfulness)\n", - " - Task-specific metrics (completion, correctness)\n", - "\n", "*AI agent evaluation metrics* are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute 
tools, and ultimately complete user tasks by analyzing the *full execution trace* — including reasoning steps, tool calls, intermediate decisions, and outcomes, rather than just single input–output pairs. These metrics are essential because agent failures often occur in ways traditional LLM metrics miss — for example, choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently.\n", "\n", - "We'll use the [`assign_scores()` method](https://docs.validmind.ai/validmind/validmind/tests.html#scorer) in this section to evaluate our banking agent's outputs and add scoring metrics to our sample dataset against metrics defined in [DeepEval’s AI agent evaluation framework](https://deepeval.com/guides/guides-ai-agent-evaluation-metrics) which breaks down AI agent evaluation metrics into three layers with corresponding metric categories.\n", + "In this section, we'll evaluate our banking agent's outputs and add scoring to our sample dataset against metrics defined in [DeepEval’s AI agent evaluation framework](https://deepeval.com/guides/guides-ai-agent-evaluation-metrics) which breaks down AI agent evaluation into three layers with corresponding subcategories: **reasoning**, **action**, and **execution**.\n", "\n", - "Together, the below three metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." + "Together, these three metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." ] }, { @@ -1391,11 +1380,20 @@ "id": "c19a585e", "metadata": {}, "source": [ - "#### Discover available scoring methods\n", + "#### Identify relevant DeepEval scorers\n", "\n", - "You can use `list_scorers()` from `validmind.scorers` to discover all available scoring methods and their IDs that can be used with `assign_scores()`.\n", + "*Scorers* are evaluation metrics that analyze model outputs and store their results in the dataset:\n", "\n", - "We'll filter these results to return only DeepEval scorers and use `_load_tests()` and `_pretty_list_tests()` from `validmind.tests.load` to create formatted tables with descriptions:" + "- Each scorer adds a new column to the dataset with format: `{scorer_name}_{metric_name}`\n", + "- The column contains the numeric score (typically `0`-`1`) for each example\n", + "- Multiple scorers can be run on the same dataset, each adding their own column\n", + "- Scores are persisted in the dataset for later analysis and visualization\n", + "- Common scorer patterns include:\n", + " - Model performance metrics (accuracy, F1, etc.)\n", + " - Output quality metrics (relevance, faithfulness)\n", + " - Task-specific metrics (completion, correctness)\n", + "\n", + "Use `list_scorers()` from [`validmind.scorers`](https://docs.validmind.ai/validmind/validmind/tests.html#scorer) to discover all available scoring methods and their IDs that can be used with `assign_scores()`. 
We'll filter these results to return only DeepEval scorers for our desired three metrics in a formatted table with descriptions:" ] }, { @@ -1405,20 +1403,61 @@ "metadata": {}, "outputs": [], "source": [ - "from validmind.scorers import list_scorers\n", - "from validmind.tests.load import _load_tests, _pretty_list_tests\n", - "\n", - "# Discover all available scorers\n", - "all_scorers = list_scorers()\n", - "\n", "# Filter to DeepEval scorers only\n", "llm_scorers = [s for s in all_scorers if \"deepeval\" in s.lower()]\n", "\n", - "# Display formatted table with descriptions\n", - "if llm_scorers:\n", - " llm_scorers_dict = _load_tests(llm_scorers)\n", - " llm_scorers_df = _pretty_list_tests(llm_scorers_dict, truncate=True)\n", - " display(llm_scorers_df)" + "# Load all DeepEval scorers\n", + "llm_scorers_dict = vm.tests.load._load_tests(llm_scorers)\n", + "\n", + "# Categorize scorers by metric layer\n", + "reasoning_scorers = {}\n", + "action_scorers = {}\n", + "execution_scorers = {}\n", + "\n", + "for scorer_id, scorer_func in llm_scorers_dict.items():\n", + " tags = getattr(scorer_func, \"__tags__\", [])\n", + " scorer_name = scorer_id.split(\".\")[-1]\n", + "\n", + " if \"reasoning_layer\" in tags:\n", + " reasoning_scorers[scorer_id] = scorer_func\n", + " elif \"action_layer\" in tags:\n", + " # StepEfficiency is tagged as action_layer but belongs to execution per DeepEval framework\n", + " if \"StepEfficiency\" in scorer_name:\n", + " execution_scorers[scorer_id] = scorer_func\n", + " else:\n", + " action_scorers[scorer_id] = scorer_func\n", + " elif \"TaskCompletion\" in scorer_name:\n", + " execution_scorers[scorer_id] = scorer_func\n", + " else:\n", + " other_scorers[scorer_id] = scorer_func\n", + "\n", + "# Display scorers by category\n", + "print(\"=\" * 80)\n", + "print(\"REASONING LAYER\")\n", + "print(\"=\" * 80)\n", + "if reasoning_scorers:\n", + " reasoning_df = vm.tests.load._pretty_list_tests(reasoning_scorers, truncate=True)\n", + " display(reasoning_df)\n", + "else:\n", + " print(\"No reasoning layer scorers found.\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"ACTION LAYER\")\n", + "print(\"=\" * 80)\n", + "if action_scorers:\n", + " action_df = vm.tests.load._pretty_list_tests(action_scorers, truncate=True)\n", + " display(action_df)\n", + "else:\n", + " print(\"No action layer scorers found.\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"EXECUTION LAYER\")\n", + "print(\"=\" * 80)\n", + "if execution_scorers:\n", + " execution_df = vm.tests.load._pretty_list_tests(execution_scorers, truncate=True)\n", + " display(execution_df)\n", + "else:\n", + " print(\"No execution layer scorers found.\")" ] }, { @@ -1426,7 +1465,7 @@ "id": "334a8456", "metadata": {}, "source": [ - "#### Reasoning layer\n", + "#### Assign Reasoning layer\n", "\n", "*Reasoning* evaluates planning and strategy generation:\n", "\n", @@ -1495,7 +1534,7 @@ "id": "df618f8c", "metadata": {}, "source": [ - "#### Action layer\n", + "#### Assign action scores\n", "\n", "*Action* assesses tool usage and argument generation:\n", "\n", @@ -1565,7 +1604,7 @@ "id": "3235d4ec", "metadata": {}, "source": [ - "#### Execution layer\n", + "#### Assign execution scores\n", "\n", "*Execution* measures end-to-end performance:\n", "\n", From 8dc2a62ec16ceaca75d3706aa2e1bf35d10cc3dc Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 12:45:43 -0800 Subject: [PATCH 43/54] Save point --- .../agents/document_agentic_ai.ipynb | 33 
+++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 5b8725281..9895e71aa 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1403,11 +1403,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Filter to DeepEval scorers only\n", - "llm_scorers = [s for s in all_scorers if \"deepeval\" in s.lower()]\n", - "\n", "# Load all DeepEval scorers\n", - "llm_scorers_dict = vm.tests.load._load_tests(llm_scorers)\n", + "llm_scorers_dict = vm.tests.load._load_tests([s for s in vm.scorer.list_scorers() if \"deepeval\" in s.lower()])\n", "\n", "# Categorize scorers by metric layer\n", "reasoning_scorers = {}\n", @@ -1428,8 +1425,6 @@ " action_scorers[scorer_id] = scorer_func\n", " elif \"TaskCompletion\" in scorer_name:\n", " execution_scorers[scorer_id] = scorer_func\n", - " else:\n", - " other_scorers[scorer_id] = scorer_func\n", "\n", "# Display scorers by category\n", "print(\"=\" * 80)\n", @@ -1704,7 +1699,7 @@ "source": [ "### Run out-of-the box RAGAS tests\n", "\n", - "Finally, we'll run some out-of-the-box *RAGAS* (Retrieval-Augmented Generation Assessment) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", + "Finally, we'll run some out-of-the-box *Retrieval-Augmented Generation Assessment* (RAGAS) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", "\n", "Our banking agent uses tools to retrieve information and generates responses based on that context, making it similar to a RAG system. RAGAS metrics help evaluate the quality of this integration by analyzing the relationship between retrieved tool outputs, user queries, and generated responses.\n", "\n", @@ -1716,14 +1711,24 @@ "id": "f27423e8", "metadata": {}, "source": [ - "#### Identify RAGAS tests*\n", + "#### Identify relevant RAGAS tests*\n", "\n", "Let's explore some of ValidMind's available tests. Using ValidMind’s repository of tests streamlines your development testing, and helps you ensure that your models are being documented and evaluated appropriately.\n", "\n", "We want to narrow down the tests we want to run from the selection provided by ValidMind, so we'll use the [`vm.tests.list_tasks_and_tags()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tasks_and_tags) to list which `tags` are associated with each `task` type:\n", "\n", - "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `classification` tasks.\n", - "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `data_quality` tag." + "- **`tasks`** represent the kind of modeling task associated with a test. 
Here we'll focus on `model_validation` tasks.\n", + "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `ragas` tag." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dedcaca5", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(task=\"model_validation\", tags=[\"ragas\"])" ] }, { @@ -1731,7 +1736,7 @@ "id": "6f2b5067", "metadata": {}, "source": [ - "##### Faithfulness" + "#### Faithfulness" ] }, { @@ -1739,7 +1744,7 @@ "id": "fe65be0a", "metadata": {}, "source": [ - "##### Response relevancy" + "#### Response relevancy" ] }, { @@ -1747,7 +1752,7 @@ "id": "9d2cb0b1", "metadata": {}, "source": [ - "##### Context recall" + "#### Context recall" ] }, { @@ -1755,7 +1760,7 @@ "id": "4508379e", "metadata": {}, "source": [ - "##### Safety" + "#### Safety" ] }, { From f82967ba1690f80c16707f3dbab49e234540f939 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:11:43 -0800 Subject: [PATCH 44/54] Save point --- .../agents/document_agentic_ai.ipynb | 171 ++++++++++++++---- 1 file changed, 138 insertions(+), 33 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 9895e71aa..c09a1bcd5 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1071,20 +1071,12 @@ "vm_test_dataset._df.head()" ] }, - { - "cell_type": "markdown", - "id": "f1deb8b6", - "metadata": {}, - "source": [ - "## Running evaluation tests" - ] - }, { "cell_type": "markdown", "id": "6eab4c9f", "metadata": {}, "source": [ - "### Run custom accuracy tests\n", + "## Running accuracy tests\n", "\n", "Using [`@vm.test`](https://docs.validmind.ai/validmind/validmind.html#test), let's implement some reusable custom *inline tests* to assess the accuracy of our banking agent:\n", "\n", @@ -1097,7 +1089,7 @@ "id": "fe9c4e8b", "metadata": {}, "source": [ - "#### Response accuracy test\n", + "### Response accuracy test\n", "\n", "We'll create a custom test that evaluates the banking agent's ability to provide accurate responses by:\n", "\n", @@ -1213,7 +1205,7 @@ "id": "edf16008", "metadata": {}, "source": [ - "#### Tool selection accuracy test\n", + "### Tool selection accuracy test\n", "\n", "We'll also create a custom test that evaluates the banking agent's ability to select the correct tools for different requests by:\n", "\n", @@ -1366,7 +1358,7 @@ "id": "b455da6e", "metadata": {}, "source": [ - "### Assign AI evaluation metric scores\n", + "## Assigning AI evaluation metric scores\n", "\n", "*AI agent evaluation metrics* are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the *full execution trace* — including reasoning steps, tool calls, intermediate decisions, and outcomes, rather than just single input–output pairs. 
These metrics are essential because agent failures often occur in ways traditional LLM metrics miss — for example, choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently.\n", "\n", @@ -1380,7 +1372,7 @@ "id": "c19a585e", "metadata": {}, "source": [ - "#### Identify relevant DeepEval scorers\n", + "### Identify relevant DeepEval scorers\n", "\n", "*Scorers* are evaluation metrics that analyze model outputs and store their results in the dataset:\n", "\n", @@ -1460,7 +1452,7 @@ "id": "334a8456", "metadata": {}, "source": [ - "#### Assign Reasoning layer\n", + "### Assign reasoning scores\n", "\n", "*Reasoning* evaluates planning and strategy generation:\n", "\n", @@ -1473,7 +1465,7 @@ "id": "a35321f1", "metadata": {}, "source": [ - "##### Plan quality metric score\n", + "#### Plan quality score\n", "\n", "Let's measure how well our banking agent generates a plan before acting. A high score means the plan is logical, complete, and efficient." ] @@ -1500,7 +1492,7 @@ "id": "25af7e17", "metadata": {}, "source": [ - "##### Plan adherence metric score\n", + "#### Plan adherence score\n", "\n", "Let's check whether our banking agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution." ] @@ -1529,7 +1521,7 @@ "id": "df618f8c", "metadata": {}, "source": [ - "#### Assign action scores\n", + "### Assign action scores\n", "\n", "*Action* assesses tool usage and argument generation:\n", "\n", @@ -1542,7 +1534,7 @@ "id": "c49015c1", "metadata": {}, "source": [ - "##### Tool correctness metric score\n", + "#### Tool correctness score\n", "\n", "Let's evaluate if our banking agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct." ] @@ -1571,7 +1563,7 @@ "id": "4a103a62", "metadata": {}, "source": [ - "##### Argument correctness metric score\n", + "#### Argument correctness score\n", "\n", "Let's assesses whether our banking agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results." ] @@ -1599,7 +1591,7 @@ "id": "3235d4ec", "metadata": {}, "source": [ - "#### Assign execution scores\n", + "### Assign execution scores\n", "\n", "*Execution* measures end-to-end performance:\n", "\n", @@ -1612,7 +1604,7 @@ "id": "9ea98971", "metadata": {}, "source": [ - "##### Task completion metric score\n", + "#### Task completion score\n", "\n", "Let's evaluate whether our banking agent successfully completes the requested tasks. Incomplete task execution can lead to user dissatisfaction and failed banking operations." ] @@ -1669,7 +1661,7 @@ "id": "19b9c40a", "metadata": {}, "source": [ - "##### Step efficiency metric score\n", + "#### Step efficiency score\n", "\n", "Let's evaluate whether our banking agent avoids unnecessary or redundant steps during task execution. Inefficient step sequences can lead to increased latency, higher costs, and poor user experience." ] @@ -1697,9 +1689,9 @@ "id": "8d80886e", "metadata": {}, "source": [ - "### Run out-of-the box RAGAS tests\n", + "## Running RAGAS tests\n", "\n", - "Finally, we'll run some out-of-the-box *Retrieval-Augmented Generation Assessment* (RAGAS) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. 
These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", + "Next, let's run some out-of-the-box *Retrieval-Augmented Generation Assessment* (RAGAS) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", "\n", "Our banking agent uses tools to retrieve information and generates responses based on that context, making it similar to a RAG system. RAGAS metrics help evaluate the quality of this integration by analyzing the relationship between retrieved tool outputs, user queries, and generated responses.\n", "\n", @@ -1711,14 +1703,16 @@ "id": "f27423e8", "metadata": {}, "source": [ - "#### Identify relevant RAGAS tests*\n", + "### Identify relevant RAGAS tests\n", "\n", "Let's explore some of ValidMind's available tests. Using ValidMind’s repository of tests streamlines your development testing, and helps you ensure that your models are being documented and evaluated appropriately.\n", "\n", - "We want to narrow down the tests we want to run from the selection provided by ValidMind, so we'll use the [`vm.tests.list_tasks_and_tags()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tasks_and_tags) to list which `tags` are associated with each `task` type:\n", + "You can pass `tasks` and `tags` as parameters to the [`vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) to filter the tests based on the tags and task types:\n", + "\n", + "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `text_qa` tasks.\n", + "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `ragas` tag.\n", "\n", - "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `model_validation` tasks.\n", - "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `ragas` tag." + "We'll then run three of these tests returned as examples below." ] }, { @@ -1728,7 +1722,7 @@ "metadata": {}, "outputs": [], "source": [ - "vm.tests.list_tests(task=\"model_validation\", tags=[\"ragas\"])" + "vm.tests.list_tests(task=\"text_qa\", tags=[\"ragas\"])" ] }, { @@ -1736,7 +1730,38 @@ "id": "6f2b5067", "metadata": {}, "source": [ - "#### Faithfulness" + "#### Faithfulness\n", + "\n", + "Faithfulness measures how accurately the banking agent's responses reflect the information retrieved from tools. 
This metric evaluates:\n", + "\n", + "**Information Accuracy**: Whether the agent correctly uses tool outputs in its responses\n", + "- **Fact Preservation**: Ensuring credit scores, loan calculations, compliance results are accurately reported\n", + "- **No Hallucination**: Verifying the agent doesn't invent banking information not provided by tools\n", + "- **Source Attribution**: Checking that responses align with actual tool outputs\n", + "\n", + "**Critical for Banking Trust**: Faithfulness is essential for banking agent reliability because users need to trust that:\n", + "- Credit analysis results are reported correctly\n", + "- Financial calculations are accurate \n", + "- Compliance checks return real information\n", + "- Risk assessments are properly communicated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f32fa6ef", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.Faithfulness\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"response_column\": [\"banking_agent_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " },\n", + ").log()" ] }, { @@ -1744,7 +1769,39 @@ "id": "fe65be0a", "metadata": {}, "source": [ - "#### Response relevancy" + "#### ResponseRelevancy\n", + "\n", + "Response Relevancy evaluates how well the banking agent's answers address the user's original banking question or request. This metric assesses:\n", + "\n", + "**Query Alignment**: Whether responses directly answer what users asked for\n", + "- **Intent Fulfillment**: Checking if the agent understood and addressed the user's actual banking need\n", + "- **Completeness**: Ensuring responses provide sufficient information to satisfy the banking query\n", + "- **Focus**: Avoiding irrelevant information that doesn't help the banking user\n", + "\n", + "**Banking Quality**: Measures the agent's ability to maintain relevant, helpful banking dialogue\n", + "- **Context Awareness**: Responses should be appropriate for the banking conversation context\n", + "- **User Satisfaction**: Answers should be useful and actionable for banking users\n", + "- **Clarity**: Banking information should be presented in a way that directly helps the user\n", + "\n", + "High relevancy indicates the banking agent successfully understands user needs and provides targeted, helpful banking responses." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f82d1db", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " params={\n", + " \"user_input_column\": \"input\",\n", + " \"response_column\": \"banking_agent_model_prediction\",\n", + " \"retrieved_contexts_column\": \"banking_agent_model_tool_messages\",\n", + " }\n", + ").log()" ] }, { @@ -1752,7 +1809,39 @@ "id": "9d2cb0b1", "metadata": {}, "source": [ - "#### Context recall" + "#### ContextRecall\n", + "\n", + "Context Recall measures how well the banking agent utilizes the information retrieved from tools when generating its responses. 
This metric evaluates:\n", + "\n", + "**Information Utilization**: Whether the agent effectively incorporates tool outputs into its responses\n", + "- **Coverage**: How much of the available tool information is used in the response\n", + "- **Integration**: How well tool outputs are woven into coherent, natural banking responses\n", + "- **Completeness**: Whether all relevant information from tools is considered\n", + "\n", + "**Tool Effectiveness**: Assesses whether selected banking tools provide useful context for responses\n", + "- **Relevance**: Whether tool outputs actually help answer the user's banking question\n", + "- **Sufficiency**: Whether enough information was retrieved to generate good banking responses\n", + "- **Quality**: Whether the tools provided accurate, helpful banking information\n", + "\n", + "High context recall indicates the banking agent not only selects the right tools but also effectively uses their outputs to create comprehensive, well-informed banking responses." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f6ff7a4", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.ContextRecall\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " \"reference_column\": [\"banking_agent_model_prediction\"],\n", + " },\n", + ").log()" ] }, { @@ -1760,7 +1849,23 @@ "id": "4508379e", "metadata": {}, "source": [ - "#### Safety" + "## Run safety tests\n" + ] + }, + { + "cell_type": "markdown", + "id": "44efe7f0", + "metadata": {}, + "source": [ + "#### AspectCritic" + ] + }, + { + "cell_type": "markdown", + "id": "6ca43e1b", + "metadata": {}, + "source": [ + "#### Bias" ] }, { From 39b3aea7cff812c176bae21ec607de859d7100fe Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:15:54 -0800 Subject: [PATCH 45/54] Running RAGAS tests --- .../agents/document_agentic_ai.ipynb | 47 +++---------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index c09a1bcd5..9a41b69f6 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1732,18 +1732,7 @@ "source": [ "#### Faithfulness\n", "\n", - "Faithfulness measures how accurately the banking agent's responses reflect the information retrieved from tools. This metric evaluates:\n", - "\n", - "**Information Accuracy**: Whether the agent correctly uses tool outputs in its responses\n", - "- **Fact Preservation**: Ensuring credit scores, loan calculations, compliance results are accurately reported\n", - "- **No Hallucination**: Verifying the agent doesn't invent banking information not provided by tools\n", - "- **Source Attribution**: Checking that responses align with actual tool outputs\n", - "\n", - "**Critical for Banking Trust**: Faithfulness is essential for banking agent reliability because users need to trust that:\n", - "- Credit analysis results are reported correctly\n", - "- Financial calculations are accurate \n", - "- Compliance checks return real information\n", - "- Risk assessments are properly communicated" + "Let's evaluate whether the banking agent's responses accurately reflect the information retrieved from tools. 
Unfaithful responses can misreport credit analysis, financial calculations, and compliance results—undermining user trust in the banking agent." ] }, { @@ -1769,21 +1758,9 @@ "id": "fe65be0a", "metadata": {}, "source": [ - "#### ResponseRelevancy\n", - "\n", - "Response Relevancy evaluates how well the banking agent's answers address the user's original banking question or request. This metric assesses:\n", - "\n", - "**Query Alignment**: Whether responses directly answer what users asked for\n", - "- **Intent Fulfillment**: Checking if the agent understood and addressed the user's actual banking need\n", - "- **Completeness**: Ensuring responses provide sufficient information to satisfy the banking query\n", - "- **Focus**: Avoiding irrelevant information that doesn't help the banking user\n", + "#### Response Relevancy\n", "\n", - "**Banking Quality**: Measures the agent's ability to maintain relevant, helpful banking dialogue\n", - "- **Context Awareness**: Responses should be appropriate for the banking conversation context\n", - "- **User Satisfaction**: Answers should be useful and actionable for banking users\n", - "- **Clarity**: Banking information should be presented in a way that directly helps the user\n", - "\n", - "High relevancy indicates the banking agent successfully understands user needs and provides targeted, helpful banking responses." + "Let's evaluate whether the banking agent's answers address the user's original question or request. Irrelevant or off-topic responses can frustrate users and fail to deliver the banking information they need." ] }, { @@ -1809,21 +1786,9 @@ "id": "9d2cb0b1", "metadata": {}, "source": [ - "#### ContextRecall\n", - "\n", - "Context Recall measures how well the banking agent utilizes the information retrieved from tools when generating its responses. This metric evaluates:\n", - "\n", - "**Information Utilization**: Whether the agent effectively incorporates tool outputs into its responses\n", - "- **Coverage**: How much of the available tool information is used in the response\n", - "- **Integration**: How well tool outputs are woven into coherent, natural banking responses\n", - "- **Completeness**: Whether all relevant information from tools is considered\n", - "\n", - "**Tool Effectiveness**: Assesses whether selected banking tools provide useful context for responses\n", - "- **Relevance**: Whether tool outputs actually help answer the user's banking question\n", - "- **Sufficiency**: Whether enough information was retrieved to generate good banking responses\n", - "- **Quality**: Whether the tools provided accurate, helpful banking information\n", + "#### Context Recall\n", "\n", - "High context recall indicates the banking agent not only selects the right tools but also effectively uses their outputs to create comprehensive, well-informed banking responses." + "Let's evaluate how well the banking agent uses the information retrieved from tools when generating its responses. Poor context recall can lead to incomplete or underinformed answers even when the right tools were selected." 
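If your test set also carries a ground-truth answer column, pointing `reference_column` at it gives a stricter recall measure than referencing the agent's own prediction. The sketch below is a hedged variant of the call above: the `expected_output` column name is an assumption, so substitute whatever reference column your dataset actually contains.

```python
# Hedged variant of the ContextRecall call above: the "expected_output" column name is
# an assumption; substitute the ground-truth/reference column present in your dataset.
vm.tests.run_test(
    "validmind.model_validation.ragas.ContextRecall",
    inputs={"dataset": vm_test_dataset},
    param_grid={
        "user_input_column": ["input"],
        "retrieved_contexts_column": ["banking_agent_model_tool_messages"],
        "reference_column": ["expected_output"],  # assumed column name
    },
).log()
```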
] }, { @@ -1849,7 +1814,7 @@ "id": "4508379e", "metadata": {}, "source": [ - "## Run safety tests\n" + "## Running safety tests\n" ] }, { From 49fe39c417ac9c844ce859ba449bd76f30b90e2f Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:30:44 -0800 Subject: [PATCH 46/54] Running safety tests --- .../agents/document_agentic_ai.ipynb | 71 ++++++++++++++++++- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 9a41b69f6..637fc2ded 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1814,7 +1814,13 @@ "id": "4508379e", "metadata": {}, "source": [ - "## Running safety tests\n" + "## Running safety tests\n", + "\n", + "Finally, let's run some out-of-the-box *safety* tests available in the ValidMind Library. Safety tests provide specialized metrics for evaluating whether AI agents operate reliably and securely. These metrics analyze different aspects of agent behavior by assessing adherence to safety guidelines, consistency of outputs, and resistance to harmful or inappropriate requests.\n", + "\n", + "Our banking agent handles sensitive financial information and user requests, making safety and reliability essential. Safety tests help evaluate whether the agent maintains appropriate boundaries, responds consistently and correctly to inputs, and avoids generating harmful, biased, or unprofessional content.\n", + "\n", + "These tests provide insights into how well our banking agent upholds standards of fairness and professionalism, ensuring it operates reliably and securely for banking users." ] }, { @@ -1822,7 +1828,29 @@ "id": "44efe7f0", "metadata": {}, "source": [ - "#### AspectCritic" + "#### AspectCritic\n", + "\n", + "Let's evaluate our banking agent's responses across multiple quality dimensions — conciseness, coherence, correctness, harmfulness, and maliciousness. Weak performance on these dimensions can degrade user experience, fall short of professional banking standards, or introduce safety risks. \n", + "\n", + "We'll use the `AspectCritic` we identified earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c14b4fb5", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.AspectCritic\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"response_column\": [\"banking_agent_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " },\n", + ").log()" ] }, { @@ -1830,7 +1858,44 @@ "id": "6ca43e1b", "metadata": {}, "source": [ - "#### Bias" + "#### Bias\n", + "\n", + "Let's evaluate whether our banking agent's prompts contain unintended biases that could affect banking decisions. 
Biased prompts can lead to unfair or discriminatory outcomes — undermining customer trust and exposing the institution to compliance risk.\n", + "\n", + "We'll first use `list_tests()` again to filter for tests relating to `prompt_validation`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f0050da", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(filter=\"prompt_validation\")" + ] + }, + { + "cell_type": "markdown", + "id": "5a0cfc84", + "metadata": {}, + "source": [ + "And then run the identified `Bias` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f612c6b2", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Bias\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" ] }, { From 384e70e667cf0bbd28d57b1c0f591766ec7724a3 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:48:09 -0800 Subject: [PATCH 47/54] CLeaning up intro --- .../agents/document_agentic_ai.ipynb | 53 +------------------ 1 file changed, 1 insertion(+), 52 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 637fc2ded..2c5082016 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -7,7 +7,7 @@ "source": [ "# Document an agentic AI system\n", "\n", - "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, prepare your datasets and agent for testing, run out-of-the-box and custom tests and log those test results to the ValidMind Platform.\n", + "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, assign AI evaluation metric scores to your agent, and run accuracy, RAGAS, and safety tests, then log those test results to the ValidMind Platform.\n", "\n", "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", "\n", @@ -1898,57 +1898,6 @@ ").log()" ] }, - { - "cell_type": "markdown", - "id": "b62f3cc9", - "metadata": {}, - "source": [ - "## In summary**\n", - "\n", - "We have successfully built and tested a comprehensive **Banking AI Agent** using LangGraph and ValidMind. Here's what we've accomplished:\n", - "\n", - "\n", - "\n", - "### What We Built\n", - "\n", - "1. **5 Specialized Banking Tools**\n", - " - Credit Risk Analyzer for loan assessments\n", - " - Customer Account Manager for account services\n", - " - Fraud Detection System for security monitoring\n", - "\n", - "2. **Intelligent LangGraph Agent**\n", - " - Automatic tool selection based on user requests\n", - " - Banking-specific system prompts and guidance\n", - " - Professional banking assistance and responses\n", - "\n", - "3. **Comprehensive Testing Framework**\n", - " - banking-specific test cases\n", - " - ValidMind integration for validation\n", - " - Performance analysis across banking domains\n", - "\n", - "\n", - "\n", - "### Next Steps\n", - "\n", - "1. 
**Customize Tools**: Adapt the banking tools to your specific banking requirements\n", - "2. **Expand Test Cases**: Add more banking scenarios and edge cases\n", - "3. **Integrate with Real Data**: Connect to actual banking systems and databases\n", - "4. **Add More Tools**: Implement additional banking-specific functionality\n", - "5. **Production Deployment**: Deploy the agent in a production banking environment\n", - "\n", - "\n", - "\n", - "### Key Benefits\n", - "\n", - "- **Industry-Specific**: Designed specifically for banking operations\n", - "- **Regulatory Compliance**: Built-in SR 11-7 and SS 1-23 compliance checks\n", - "- **Risk Management**: Comprehensive credit and fraud risk assessment\n", - "- **Customer Focus**: Tools for both retail and commercial banking needs\n", - "- **Real-World Applicability**: Addresses actual banking use cases and challenges\n", - "\n", - "Your banking AI agent is now ready to handle real-world banking scenarios while maintaining regulatory compliance and risk management best practices!" - ] - }, { "cell_type": "markdown", "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", From e7e1b0ecb3b9f40d0ac9c1e1df0abeb218c7e6ad Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:54:07 -0800 Subject: [PATCH 48/54] Save point --- .../agents/document_agentic_ai.ipynb | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 2c5082016..6bc99ada4 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1922,6 +1922,28 @@ "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" ] }, + { + "cell_type": "markdown", + "id": "002a6b1d", + "metadata": {}, + "source": [ + "### Customize the banking agent for your use case\n", + "\n", + "Key benefits:\n", + "\n", + "- **Industry-Specific**: Designed specifically for banking operations\n", + "- **Regulatory Compliance**: Built-in SR 11-7 and SS 1-23 compliance checks\n", + "- **Risk Management**: Comprehensive credit and fraud risk assessment\n", + "- **Customer Focus**: Tools for both retail and commercial banking needs\n", + "- **Real-World Applicability**: Addresses actual banking use cases and challenges\n", + "\n", + "1. **Customize Tools**: Adapt the banking tools to your specific banking requirements\n", + "2. **Expand Test Cases**: Add more banking scenarios and edge cases\n", + "3. **Integrate with Real Data**: Connect to actual banking systems and databases\n", + "4. **Add More Tools**: Implement additional banking-specific functionality\n", + "5. 
**Production Deployment**: Deploy the agent in a production banking environmen" + ] + }, { "cell_type": "markdown", "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", @@ -1929,7 +1951,13 @@ "source": [ "### Discover more learning resources\n", "\n", - "We offer many interactive notebooks to help you document models:\n", + "Learn more about the ValidMind Library tools we used in this notebook:\n", + "\n", + "- [Custom prompts](https://docs.validmind.ai/notebooks/how_to/customize_test_result_descriptions.html)\n", + "- [Custom tests](https://docs.validmind.ai/notebooks/code_samples/custom_tests/implement_custom_tests.html)\n", + "- [ValidMind scorers](https://docs.validmind.ai/notebooks/how_to/assign_scores_complete_tutorial.html)\n", + "\n", + "We also offer many more interactive notebooks to help you document models:\n", "\n", "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", From 93ecd4746f65f5d1e835fb43dffb20ed5a48edcb Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:03:22 -0800 Subject: [PATCH 49/54] Cleanup: Next steps --- .../agents/document_agentic_ai.ipynb | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 6bc99ada4..3f75fe622 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1919,7 +1919,9 @@ "\n", "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", "\n", - "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)" + " What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)\n", + "\n", + "3. Click into any section related to the tests we ran in this notebook, for example: **4.3. Prompt Evaluation** to review the results of the tests we logged." ] }, { @@ -1929,19 +1931,12 @@ "source": [ "### Customize the banking agent for your use case\n", "\n", - "Key benefits:\n", - "\n", - "- **Industry-Specific**: Designed specifically for banking operations\n", - "- **Regulatory Compliance**: Built-in SR 11-7 and SS 1-23 compliance checks\n", - "- **Risk Management**: Comprehensive credit and fraud risk assessment\n", - "- **Customer Focus**: Tools for both retail and commercial banking needs\n", - "- **Real-World Applicability**: Addresses actual banking use cases and challenges\n", + "You've now built an agentic AI system designed for banking use cases that supports compliance with supervisory guidance such as SR 11-7 and SS1/23, covering credit and fraud risk assessment for both retail and commercial banking. Extend this example agent to real-world banking scenarios and production deployment by:\n", "\n", - "1. 
**Customize Tools**: Adapt the banking tools to your specific banking requirements\n", - "2. **Expand Test Cases**: Add more banking scenarios and edge cases\n", - "3. **Integrate with Real Data**: Connect to actual banking systems and databases\n", - "4. **Add More Tools**: Implement additional banking-specific functionality\n", - "5. **Production Deployment**: Deploy the agent in a production banking environmen" + "- Adapting the banking tools to your organization's specific requirements\n", + "- Adding more banking scenarios and edge cases to your test set\n", + "- Connecting the agent to your banking systems and databases\n", + "- Implementing additional banking-specific tools and workflows" ] }, { From 15879de547ed5121ba1e3231b68a4acabc6a1423 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:17:30 -0800 Subject: [PATCH 50/54] Removing old notebook & adding toc --- .../agents/document_agentic_ai.ipynb | 384 +++-- .../langgraph_agent_simple_banking_demo.ipynb | 1501 ----------------- 2 files changed, 264 insertions(+), 1621 deletions(-) delete mode 100644 notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index 3f75fe622..efadb761e 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "382caa31", + "id": "e7277c38", "metadata": {}, "source": [ "# Document an agentic AI system\n", @@ -21,9 +21,87 @@ }, { "cell_type": "markdown", - "id": "about-intro-d367bebc-fcd0-46df-93ba-5f731f4b9ba9", - "metadata": {}, - "source": [ + "id": "a47dd942", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Install the ValidMind Library](#toc2_1__) \n", + " - [Initialize the ValidMind Library](#toc2_2__) \n", + " - [Register sample model](#toc2_2_1__) \n", + " - [Apply documentation template](#toc2_2_2__) \n", + " - [Get your code snippet](#toc2_2_3__) \n", + " - [Preview the documentation template](#toc2_2_4__) \n", + " - [Verify OpenAI API access](#toc2_3__) \n", + " - [Initialize the Python environment](#toc2_4__) \n", + "- [Building the LangGraph agent](#toc3__) \n", + " - [Test available banking tools](#toc3_1__) \n", + " - [Create LangGraph banking agent](#toc3_2__) \n", + " - [Define system prompt](#toc3_2_1__) \n", + " - [Initialize the LLM](#toc3_2_2__) \n", + " - [Define agent state structure](#toc3_2_3__) \n", + " - [Create agent workflow function](#toc3_2_4__) \n", + " - [Instantiate the banking agent](#toc3_2_5__) \n", + " - [Integrate agent with ValidMind](#toc3_3__) \n", + " - [Import ValidMind components](#toc3_3_1__) \n", + " - [Create agent wrapper function](#toc3_3_2__) \n", + " - [Initialize the ValidMind model object](#toc3_3_3__) \n", + " - [Store the agent reference](#toc3_3_4__) \n", + " - [Verify integration](#toc3_3_5__) \n", + " - [Validate the system prompt](#toc3_4__) \n", + "- [Initialize the ValidMind datasets](#toc4__) \n", + " - [Assign predictions](#toc4_1__) \n", + "- [Running accuracy tests](#toc5__) \n", + " - [Response accuracy test](#toc5_1__) \n", + " - [Tool selection accuracy 
test](#toc5_2__) \n", + "- [Assigning AI evaluation metric scores](#toc6__) \n", + " - [Identify relevant DeepEval scorers](#toc6_1__) \n", + " - [Assign reasoning scores](#toc6_2__) \n", + " - [Plan quality score](#toc6_2_1__) \n", + " - [Plan adherence score](#toc6_2_2__) \n", + " - [Assign action scores](#toc6_3__) \n", + " - [Tool correctness score](#toc6_3_1__) \n", + " - [Argument correctness score](#toc6_3_2__) \n", + " - [Assign execution scores](#toc6_4__) \n", + " - [Task completion score](#toc6_4_1__) \n", + " - [Step efficiency score](#toc6_4_2__) \n", + "- [Running RAGAS tests](#toc7__) \n", + " - [Identify relevant RAGAS tests](#toc7_1__) \n", + " - [Faithfulness](#toc7_1_1__) \n", + " - [Response Relevancy](#toc7_1_2__) \n", + " - [Context Recall](#toc7_1_3__) \n", + "- [Running safety tests](#toc8__) \n", + " - [AspectCritic](#toc8_1_1__) \n", + " - [Bias](#toc8_1_2__) \n", + "- [Next steps](#toc9__) \n", + " - [Work with your model documentation](#toc9_1__) \n", + " - [Customize the banking agent for your use case](#toc9_2__) \n", + " - [Discover more learning resources](#toc9_3__) \n", + "- [Upgrade ValidMind](#toc10__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "ecaad35f", + "metadata": {}, + "source": [ + "\n", + "\n", "## About ValidMind\n", "\n", "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", @@ -33,9 +111,11 @@ }, { "cell_type": "markdown", - "id": "about-begin-c1d3d399-73ca-4a88-9ca8-16603eab0751", + "id": "6ff1f9ef", "metadata": {}, "source": [ + "\n", + "\n", "### Before you begin\n", "\n", "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", @@ -45,9 +125,11 @@ }, { "cell_type": "markdown", - "id": "about-signup-65fed156-8e69-49eb-b04e-ae52c574d8e7", + "id": "d7ad8d8c", "metadata": {}, "source": [ + "\n", + "\n", "### New to ValidMind?\n", "\n", "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", @@ -59,9 +141,11 @@ }, { "cell_type": "markdown", - "id": "about-concepts-f49034ef-bfb8-4643-88dc-5647eb76968a", + "id": "323caa59", "metadata": {}, "source": [ + "\n", + "\n", "### Key concepts\n", "\n", "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", @@ -92,17 +176,20 @@ }, { "cell_type": "markdown", - "id": "cell-d35096df-f3f8-4034-87fa-1973bbdd7b49-efbed180-2181-4735-83c9-7d12b69b9f4c", + "id": "ddba5169", "metadata": {}, "source": [ + "\n", + "\n", "## Setting up" ] }, { "cell_type": "markdown", - "id": "install-library-20ef5375-d71d-47b1-b669-ed90cb3f0e90", "metadata": {}, "source": [ + "\n", + "\n", "### Install the ValidMind Library\n", "\n", "
Recommended Python versions\n", @@ -115,7 +202,7 @@ { "cell_type": "code", "execution_count": null, - "id": "install-python-94668f4a-272a-4d93-b7c2-da735557fd37", + "id": "1982a118", "metadata": {}, "outputs": [], "source": [ @@ -124,17 +211,21 @@ }, { "cell_type": "markdown", - "id": "install-initialize-13c50d00-6167-4147-a886-541d73129898", + "id": "dc9dea3a", "metadata": {}, "source": [ + "\n", + "\n", "### Initialize the ValidMind Library" ] }, { "cell_type": "markdown", - "id": "install-register-ab56529b-d2f2-4942-b18e-603fb7342e5b", + "id": "5848461e", "metadata": {}, "source": [ + "\n", + "\n", "#### Register sample model\n", "\n", "Let's first register a sample model for use with this notebook.\n", @@ -152,9 +243,11 @@ }, { "cell_type": "markdown", - "id": "install-template-6e8d6279-0edf-442a-bb63-50f7f89e60b3", + "id": "97d0b04b", "metadata": {}, "source": [ + "\n", + "\n", "#### Apply documentation template\n", "\n", "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", @@ -168,7 +261,7 @@ }, { "cell_type": "markdown", - "id": "e4a16ffa", + "id": "b279d5fa", "metadata": {}, "source": [ "
Can't select this template?\n", @@ -182,9 +275,10 @@ }, { "cell_type": "markdown", - "id": "install-snippet-468eb7c3-612b-4f14-96aa-dc5a3088448c", "metadata": {}, "source": [ + "\n", + "\n", "#### Get your code snippet\n", "\n", "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", @@ -196,7 +290,7 @@ { "cell_type": "code", "execution_count": null, - "id": "install-init-9b9b870b-01e8-428d-afbf-8e3d9771b3f4", + "id": "d6ccbefc", "metadata": {}, "outputs": [], "source": [ @@ -219,9 +313,10 @@ }, { "cell_type": "markdown", - "id": "install-preview-4e403693-e030-4b31-9d39-74a6af8f470f", "metadata": {}, "source": [ + "\n", + "\n", "#### Preview the documentation template\n", "\n", "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", @@ -232,7 +327,7 @@ { "cell_type": "code", "execution_count": null, - "id": "install-preview-template-b5149386-4ab1-41cb-9527-880573d91509", + "id": "dffdaa6f", "metadata": {}, "outputs": [], "source": [ @@ -241,9 +336,10 @@ }, { "cell_type": "markdown", - "id": "ba45feba", "metadata": {}, "source": [ + "\n", + "\n", "### Verify OpenAI API access\n", "\n", "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" @@ -252,7 +348,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9684fde1", + "id": "22cc39cb", "metadata": {}, "outputs": [], "source": [ @@ -266,9 +362,10 @@ }, { "cell_type": "markdown", - "id": "679111bb", "metadata": {}, "source": [ + "\n", + "\n", "### Initialize the Python environment\n", "\n", "Let's import all the necessary libraries to prepare for building our banking LangGraph agentic system:\n", @@ -283,7 +380,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3a64a021", + "id": "2058d1ac", "metadata": {}, "outputs": [], "source": [ @@ -332,17 +429,20 @@ }, { "cell_type": "markdown", - "id": "cf6ebc6c", + "id": "e109d075", "metadata": {}, "source": [ + "\n", + "\n", "## Building the LangGraph agent" ] }, { "cell_type": "markdown", - "id": "bf4fc0d7", "metadata": {}, "source": [ + "\n", + "\n", "### Test available banking tools\n", "\n", "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", @@ -355,7 +455,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0c862fdd", + "id": "1e0a120c", "metadata": {}, "outputs": [], "source": [ @@ -367,7 +467,6 @@ }, { "cell_type": "markdown", - "id": "4d6f0e26", "metadata": {}, "source": [ "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent:" @@ -437,7 +536,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8442bf81", + "id": "a983b30d", "metadata": {}, "outputs": [], "source": [ @@ -464,9 +563,11 @@ }, { "cell_type": "markdown", - "id": "5ed83560", + "id": "6bf04845", "metadata": {}, "source": [ + "\n", + "\n", "### Create LangGraph banking agent\n", "\n", "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." 
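Before the following cells assemble the agent piece by piece, here is a compact, self-contained sketch of the routing pattern being built: the graph runs START → llm → (tools → llm, as needed) → END, with the LLM node deciding whether a tool call is required. The stub LLM node and `echo_tool` below are placeholders for illustration only — the real agent uses the tool-bound `gpt-5-mini` model and the `AVAILABLE_TOOLS` imported earlier.

```python
# Minimal sketch of the workflow topology the next cells build.
from typing import Annotated, Sequence, TypedDict

from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langchain_core.tools import tool
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode


@tool
def echo_tool(text: str) -> str:
    """Placeholder tool used only for this sketch."""
    return f"echo: {text}"


class SketchState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], add_messages]


def llm_node(state: SketchState) -> SketchState:
    # The real agent calls the tool-bound LLM here; this stub just answers.
    return {"messages": [AIMessage(content="stub response")]}


def should_continue(state: SketchState) -> str:
    # Route to the tool node only when the last message requests tool calls.
    last = state["messages"][-1]
    return "tools" if getattr(last, "tool_calls", None) else END


workflow = StateGraph(SketchState)
workflow.add_node("llm", llm_node)
workflow.add_node("tools", ToolNode([echo_tool]))
workflow.add_edge(START, "llm")
workflow.add_conditional_edges("llm", should_continue, {"tools": "tools", END: END})
workflow.add_edge("tools", "llm")  # tool results flow back to the LLM

sketch_agent = workflow.compile(checkpointer=MemorySaver())
result = sketch_agent.invoke(
    {"messages": [HumanMessage(content="hello")]},
    config={"configurable": {"thread_id": "sketch-1"}},
)
print(result["messages"][-1].content)
```

The conditional edge is the key design choice: the same LLM node both plans and summarizes, so tool outputs loop back to it until no further tool calls are requested.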
@@ -474,9 +575,10 @@ }, { "cell_type": "markdown", - "id": "6a5beb28", "metadata": {}, "source": [ + "\n", + "\n", "#### Define system prompt\n", "\n", "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" @@ -485,7 +587,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64f46a1c", + "id": "7971c427", "metadata": {}, "outputs": [], "source": [ @@ -531,9 +633,10 @@ }, { "cell_type": "markdown", - "id": "3f8da88d", "metadata": {}, "source": [ + "\n", + "\n", "#### Initialize the LLM\n", "\n", "Let's initialize the LLM that will power our banking agent:" @@ -542,7 +645,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b828d70", + "id": "866066e7", "metadata": {}, "outputs": [], "source": [ @@ -558,7 +661,6 @@ }, { "cell_type": "markdown", - "id": "866b59cb", "metadata": {}, "source": [ "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate based on request input and the system prompt we defined above:" @@ -567,7 +669,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65f85d86", + "id": "906d8132", "metadata": {}, "outputs": [], "source": [ @@ -577,9 +679,10 @@ }, { "cell_type": "markdown", - "id": "5f898062", "metadata": {}, "source": [ + "\n", + "\n", "#### Define agent state structure\n", "\n", "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", @@ -595,7 +698,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c7feeebb", + "id": "6b926ddf", "metadata": {}, "outputs": [], "source": [ @@ -609,9 +712,10 @@ }, { "cell_type": "markdown", - "id": "31b261b2", "metadata": {}, "source": [ + "\n", + "\n", "#### Create agent workflow function\n", "\n", "We'll build the LangGraph agent workflow with two main components:\n", @@ -625,7 +729,7 @@ { "cell_type": "code", "execution_count": null, - "id": "142c20b9", + "id": "2c9bf585", "metadata": {}, "outputs": [], "source": [ @@ -675,9 +779,10 @@ }, { "cell_type": "markdown", - "id": "0f19d4b7", "metadata": {}, "source": [ + "\n", + "\n", "#### Instantiate the banking agent\n", "\n", "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", @@ -688,7 +793,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aabb7842", + "id": "455b8ee4", "metadata": {}, "outputs": [], "source": [ @@ -706,9 +811,11 @@ }, { "cell_type": "markdown", - "id": "cfd302bb", + "id": "12691528", "metadata": {}, "source": [ + "\n", + "\n", "### Integrate agent with ValidMind\n", "\n", "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
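As a rough, trimmed-down sketch of the contract such a wrapper needs to satisfy — the complete version, including tool-call and tool-output extraction plus error fallbacks, is defined in the cells below — the function receives one dataset row as a dictionary and returns at least a `prediction` string along with the raw agent output for trace-based tests. The variable name `banking_agent` for the compiled agent and the simplified `.content` access are assumptions here; the full wrapper handles the model's structured content blocks.

```python
from langchain_core.messages import HumanMessage


def banking_agent_fn_sketch(input_row):
    """Minimal predict-fn wrapper: invoke the agent, return what ValidMind needs."""
    initial_state = {
        "user_input": input_row["input"],
        "messages": [HumanMessage(content=input_row["input"])],
        "session_id": input_row["session_id"],
        "context": {},
    }
    config = {"configurable": {"thread_id": input_row["session_id"]}}
    result = banking_agent.invoke(initial_state, config=config)  # compiled agent from the previous step
    return {
        "prediction": result["messages"][-1].content,  # final answer (simplified access)
        "output": result,                              # full trace for agent-level metrics
    }
```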
@@ -716,9 +823,10 @@ }, { "cell_type": "markdown", - "id": "e2540236", "metadata": {}, "source": [ + "\n", + "\n", "#### Import ValidMind components\n", "\n", "We'll start with importing the necessary ValidMind components for integrating our agent:\n", @@ -730,7 +838,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67557905", + "id": "9aeb8969", "metadata": {}, "outputs": [], "source": [ @@ -740,9 +848,10 @@ }, { "cell_type": "markdown", - "id": "c30dd6b1", "metadata": {}, "source": [ + "\n", + "\n", "#### Create agent wrapper function\n", "\n", "We'll then create a wrapper function that:\n", @@ -757,7 +866,7 @@ { "cell_type": "code", "execution_count": null, - "id": "db1fcc20", + "id": "0e4d5a82", "metadata": {}, "outputs": [], "source": [ @@ -821,11 +930,11 @@ }, { "cell_type": "markdown", - "id": "4ea44f1e", "metadata": {}, "source": [ - "#### Initialize the ValidMind model object\n", + "\n", "\n", + "#### Initialize the ValidMind model object\n", "\n", "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", "\n", @@ -840,7 +949,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a4389e36", + "id": "60a2ce7a", "metadata": {}, "outputs": [], "source": [ @@ -854,9 +963,10 @@ }, { "cell_type": "markdown", - "id": "cd6eb68b", "metadata": {}, "source": [ + "\n", + "\n", "#### Store the agent reference\n", "\n", "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." @@ -865,7 +975,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8e39d400", + "id": "2c653471", "metadata": {}, "outputs": [], "source": [ @@ -875,9 +985,10 @@ }, { "cell_type": "markdown", - "id": "2db4b849", "metadata": {}, "source": [ + "\n", + "\n", "#### Verify integration\n", "\n", "Let's confirm that the banking agent has been successfully integrated with ValidMind:" @@ -886,7 +997,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59afbb6d", + "id": "8e101b0f", "metadata": {}, "outputs": [], "source": [ @@ -896,9 +1007,10 @@ }, { "cell_type": "markdown", - "id": "af84f571", "metadata": {}, "source": [ + "\n", + "\n", "### Validate the system prompt\n", "\n", "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering by running a few tests — we'll run evaluation tests later on our agent's performance.\n", @@ -975,7 +1087,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f3bd1038", + "id": "bba99915", "metadata": {}, "outputs": [], "source": [ @@ -989,9 +1101,10 @@ }, { "cell_type": "markdown", - "id": "9035ae24", "metadata": {}, "source": [ + "\n", + "\n", "## Initialize the ValidMind datasets\n", "\n", "After validation our system prompt, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use in the next section to evaluate our agent's performance across different banking scenarios:" @@ -1000,7 +1113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b0620699", + "id": "0c70ca2c", "metadata": {}, "outputs": [], "source": [ @@ -1009,7 +1122,6 @@ }, { "cell_type": "markdown", - "id": "22f93945", "metadata": {}, "source": [ "The next step is to connect your data with a ValidMind `Dataset` object. 
**This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n", @@ -1019,13 +1131,13 @@ "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n", "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n", "- **`text_column`** — The name of the column containing the text input data.\n", - "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset.\n" + "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset." ] }, { "cell_type": "code", "execution_count": null, - "id": "b90a7dfd", + "id": "a7e9d158", "metadata": {}, "outputs": [], "source": [ @@ -1044,9 +1156,10 @@ }, { "cell_type": "markdown", - "id": "8b7da187", "metadata": {}, "source": [ + "\n", + "\n", "### Assign predictions\n", "\n", "Now that both the model object and the datasets have been registered, we'll assign predictions to capture the banking agent's responses for evaluation:\n", @@ -1060,7 +1173,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83350c38", + "id": "1d462663", "metadata": {}, "outputs": [], "source": [ @@ -1073,9 +1186,11 @@ }, { "cell_type": "markdown", - "id": "6eab4c9f", + "id": "8e50467e", "metadata": {}, "source": [ + "\n", + "\n", "## Running accuracy tests\n", "\n", "Using [`@vm.test`](https://docs.validmind.ai/validmind/validmind.html#test), let's implement some reusable custom *inline tests* to assess the accuracy of our banking agent:\n", @@ -1086,9 +1201,10 @@ }, { "cell_type": "markdown", - "id": "fe9c4e8b", "metadata": {}, "source": [ + "\n", + "\n", "### Response accuracy test\n", "\n", "We'll create a custom test that evaluates the banking agent's ability to provide accurate responses by:\n", @@ -1102,7 +1218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "335aeedc", + "id": "90232066", "metadata": {}, "outputs": [], "source": [ @@ -1144,7 +1260,6 @@ }, { "cell_type": "markdown", - "id": "f3ca07d3", "metadata": {}, "source": [ "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the [`log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#log):" @@ -1153,7 +1268,7 @@ { "cell_type": "code", "execution_count": null, - "id": "567a058a", + "id": "e68884d5", "metadata": {}, "outputs": [], "source": [ @@ -1172,7 +1287,6 @@ }, { "cell_type": "markdown", - "id": "397b3a0d", "metadata": {}, "source": [ "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. 
Each column in the output serves a specific purpose in evaluating agent performance:\n", @@ -1193,7 +1307,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd9a2155", + "id": "78f7edb1", "metadata": {}, "outputs": [], "source": [ @@ -1202,9 +1316,11 @@ }, { "cell_type": "markdown", - "id": "edf16008", + "id": "6f233bef", "metadata": {}, "source": [ + "\n", + "\n", "### Tool selection accuracy test\n", "\n", "We'll also create a custom test that evaluates the banking agent's ability to select the correct tools for different requests by:\n", @@ -1217,7 +1333,6 @@ }, { "cell_type": "markdown", - "id": "62e28f11", "metadata": {}, "source": [ "First, we'll define a helper function that extracts tool calls from the agent's messages and compares them against the expected tools. This function handles different message formats (dictionary or object) and calculates accuracy scores:" @@ -1226,7 +1341,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f8f18f1b", + "id": "e68798be", "metadata": {}, "outputs": [], "source": [ @@ -1263,7 +1378,6 @@ }, { "cell_type": "markdown", - "id": "7fb499c4", "metadata": {}, "source": [ "Now we'll define the main test function that uses the helper function to evaluate tool selection accuracy across all test cases in the dataset:" @@ -1272,7 +1386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d958ab5e", + "id": "604d7313", "metadata": {}, "outputs": [], "source": [ @@ -1327,7 +1441,6 @@ }, { "cell_type": "markdown", - "id": "48836abb", "metadata": {}, "source": [ "Finally, we can call our function with `run_test()` and log the test results to the ValidMind Platform:" @@ -1336,7 +1449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2cc532d4", + "id": "dd14115e", "metadata": {}, "outputs": [], "source": [ @@ -1355,9 +1468,11 @@ }, { "cell_type": "markdown", - "id": "b455da6e", + "id": "f78f4107", "metadata": {}, "source": [ + "\n", + "\n", "## Assigning AI evaluation metric scores\n", "\n", "*AI agent evaluation metrics* are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the *full execution trace* — including reasoning steps, tool calls, intermediate decisions, and outcomes, rather than just single input–output pairs. These metrics are essential because agent failures often occur in ways traditional LLM metrics miss — for example, choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently.\n", @@ -1369,9 +1484,10 @@ }, { "cell_type": "markdown", - "id": "c19a585e", "metadata": {}, "source": [ + "\n", + "\n", "### Identify relevant DeepEval scorers\n", "\n", "*Scorers* are evaluation metrics that analyze model outputs and store their results in the dataset:\n", @@ -1391,7 +1507,7 @@ { "cell_type": "code", "execution_count": null, - "id": "782f0e46", + "id": "730c70ec", "metadata": {}, "outputs": [], "source": [ @@ -1449,9 +1565,11 @@ }, { "cell_type": "markdown", - "id": "334a8456", + "id": "4dd73d0d", "metadata": {}, "source": [ + "\n", + "\n", "### Assign reasoning scores\n", "\n", "*Reasoning* evaluates planning and strategy generation:\n", @@ -1462,9 +1580,10 @@ }, { "cell_type": "markdown", - "id": "a35321f1", "metadata": {}, "source": [ + "\n", + "\n", "#### Plan quality score\n", "\n", "Let's measure how well our banking agent generates a plan before acting. A high score means the plan is logical, complete, and efficient." 
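The next cell assigns this scorer to the dataset. As a rough sketch of what such a call looks like — the scorer identifier and the exact `assign_scores()` signature shown here are assumptions for illustration; use the scorer IDs reported by the discovery cell above:

```python
# Illustrative only: the scorer ID below is a guess at the registered name --
# substitute the PlanQuality ID listed by the scorer discovery cell.
vm_test_dataset.assign_scores(
    vm_banking_model,
    "validmind.scorers.llm.deepeval.PlanQuality",
)

# Results are added as new columns named {scorer_name}_{metric_name}:
vm_test_dataset._df.filter(like="PlanQuality").head()
```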
@@ -1473,7 +1592,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b37dfa9e", + "id": "52f362ba", "metadata": {}, "outputs": [], "source": [ @@ -1489,9 +1608,10 @@ }, { "cell_type": "markdown", - "id": "25af7e17", "metadata": {}, "source": [ + "\n", + "\n", "#### Plan adherence score\n", "\n", "Let's check whether our banking agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution." @@ -1500,7 +1620,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a75445dd", + "id": "4124a7c2", "metadata": {}, "outputs": [], "source": [ @@ -1518,9 +1638,11 @@ }, { "cell_type": "markdown", - "id": "df618f8c", + "id": "6da1ac95", "metadata": {}, "source": [ + "\n", + "\n", "### Assign action scores\n", "\n", "*Action* assesses tool usage and argument generation:\n", @@ -1531,9 +1653,10 @@ }, { "cell_type": "markdown", - "id": "c49015c1", "metadata": {}, "source": [ + "\n", + "\n", "#### Tool correctness score\n", "\n", "Let's evaluate if our banking agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct." @@ -1542,7 +1665,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33be2dc8", + "id": "8d2e8a25", "metadata": {}, "outputs": [], "source": [ @@ -1560,9 +1683,10 @@ }, { "cell_type": "markdown", - "id": "4a103a62", "metadata": {}, "source": [ + "\n", + "\n", "#### Argument correctness score\n", "\n", "Let's assesses whether our banking agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results." @@ -1571,7 +1695,7 @@ { "cell_type": "code", "execution_count": null, - "id": "860b318e", + "id": "04f90489", "metadata": {}, "outputs": [], "source": [ @@ -1588,9 +1712,11 @@ }, { "cell_type": "markdown", - "id": "3235d4ec", + "id": "c59e5595", "metadata": {}, "source": [ + "\n", + "\n", "### Assign execution scores\n", "\n", "*Execution* measures end-to-end performance:\n", @@ -1601,9 +1727,10 @@ }, { "cell_type": "markdown", - "id": "9ea98971", "metadata": {}, "source": [ + "\n", + "\n", "#### Task completion score\n", "\n", "Let's evaluate whether our banking agent successfully completes the requested tasks. Incomplete task execution can lead to user dissatisfaction and failed banking operations." @@ -1612,7 +1739,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48ac405a", + "id": "05024f1f", "metadata": {}, "outputs": [], "source": [ @@ -1629,7 +1756,6 @@ }, { "cell_type": "markdown", - "id": "c33c568a", "metadata": {}, "source": [ "As you recall from the beginning of this section, when we run scorers through `assign_scores()`, the return values are automatically processed and added as new columns with the format `{scorer_name}_{metric_name}`. Note that the task completion scorer has added a new column `TaskCompletion_score` to our dataset.\n", @@ -1640,7 +1766,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5f27f3c1", + "id": "7f6d08ca", "metadata": {}, "outputs": [], "source": [ @@ -1658,9 +1784,10 @@ }, { "cell_type": "markdown", - "id": "19b9c40a", "metadata": {}, "source": [ + "\n", + "\n", "#### Step efficiency score\n", "\n", "Let's evaluate whether our banking agent avoids unnecessary or redundant steps during task execution. Inefficient step sequences can lead to increased latency, higher costs, and poor user experience." 
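Once the next cell has run this scorer, its per-row results sit alongside the earlier ones in the dataset, so a quick pandas summary makes it easy to compare scorers side by side. A small sketch, assuming the score columns are numeric and follow the `{scorer_name}_{metric_name}` convention described above (the exact `StepEfficiency` column name is an assumption):

```python
# Collect every scorer-produced column (e.g. "TaskCompletion_score") and
# summarize them together.
score_cols = [c for c in vm_test_dataset._df.columns if c.endswith("_score")]
vm_test_dataset._df[score_cols].describe().T[["mean", "min", "max"]]
```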
@@ -1669,7 +1796,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36f8fcf5", + "id": "aa6e154a", "metadata": {}, "outputs": [], "source": [ @@ -1686,9 +1813,11 @@ }, { "cell_type": "markdown", - "id": "8d80886e", + "id": "012bbcb8", "metadata": {}, "source": [ + "\n", + "\n", "## Running RAGAS tests\n", "\n", "Next, let's run some out-of-the-box *Retrieval-Augmented Generation Assessment* (RAGAS) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", @@ -1700,9 +1829,10 @@ }, { "cell_type": "markdown", - "id": "f27423e8", "metadata": {}, "source": [ + "\n", + "\n", "### Identify relevant RAGAS tests\n", "\n", "Let's explore some of ValidMind's available tests. Using ValidMind’s repository of tests streamlines your development testing, and helps you ensure that your models are being documented and evaluated appropriately.\n", @@ -1718,7 +1848,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dedcaca5", + "id": "0701f5a9", "metadata": {}, "outputs": [], "source": [ @@ -1727,9 +1857,10 @@ }, { "cell_type": "markdown", - "id": "6f2b5067", "metadata": {}, "source": [ + "\n", + "\n", "#### Faithfulness\n", "\n", "Let's evaluate whether the banking agent's responses accurately reflect the information retrieved from tools. Unfaithful responses can misreport credit analysis, financial calculations, and compliance results—undermining user trust in the banking agent." @@ -1738,7 +1869,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f32fa6ef", + "id": "92044533", "metadata": {}, "outputs": [], "source": [ @@ -1755,9 +1886,10 @@ }, { "cell_type": "markdown", - "id": "fe65be0a", "metadata": {}, "source": [ + "\n", + "\n", "#### Response Relevancy\n", "\n", "Let's evaluate whether the banking agent's answers address the user's original question or request. Irrelevant or off-topic responses can frustrate users and fail to deliver the banking information they need." @@ -1766,7 +1898,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8f82d1db", + "id": "d7483bc3", "metadata": {}, "outputs": [], "source": [ @@ -1783,9 +1915,10 @@ }, { "cell_type": "markdown", - "id": "9d2cb0b1", "metadata": {}, "source": [ + "\n", + "\n", "#### Context Recall\n", "\n", "Let's evaluate how well the banking agent uses the information retrieved from tools when generating its responses. Poor context recall can lead to incomplete or underinformed answers even when the right tools were selected." @@ -1794,7 +1927,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f6ff7a4", + "id": "e5dc00ce", "metadata": {}, "outputs": [], "source": [ @@ -1811,9 +1944,11 @@ }, { "cell_type": "markdown", - "id": "4508379e", + "id": "b987b00e", "metadata": {}, "source": [ + "\n", + "\n", "## Running safety tests\n", "\n", "Finally, let's run some out-of-the-box *safety* tests available in the ValidMind Library. Safety tests provide specialized metrics for evaluating whether AI agents operate reliably and securely. 
These metrics analyze different aspects of agent behavior by assessing adherence to safety guidelines, consistency of outputs, and resistance to harmful or inappropriate requests.\n", @@ -1825,9 +1960,10 @@ }, { "cell_type": "markdown", - "id": "44efe7f0", "metadata": {}, "source": [ + "\n", + "\n", "#### AspectCritic\n", "\n", "Let's evaluate our banking agent's responses across multiple quality dimensions — conciseness, coherence, correctness, harmfulness, and maliciousness. Weak performance on these dimensions can degrade user experience, fall short of professional banking standards, or introduce safety risks. \n", @@ -1838,7 +1974,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c14b4fb5", + "id": "148daa2b", "metadata": {}, "outputs": [], "source": [ @@ -1855,9 +1991,10 @@ }, { "cell_type": "markdown", - "id": "6ca43e1b", "metadata": {}, "source": [ + "\n", + "\n", "#### Bias\n", "\n", "Let's evaluate whether our banking agent's prompts contain unintended biases that could affect banking decisions. Biased prompts can lead to unfair or discriminatory outcomes — undermining customer trust and exposing the institution to compliance risk.\n", @@ -1868,7 +2005,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2f0050da", + "id": "74eba86c", "metadata": {}, "outputs": [], "source": [ @@ -1877,7 +2014,6 @@ }, { "cell_type": "markdown", - "id": "5a0cfc84", "metadata": {}, "source": [ "And then run the identified `Bias` test:" @@ -1886,7 +2022,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f612c6b2", + "id": "062cf8e7", "metadata": {}, "outputs": [], "source": [ @@ -1900,9 +2036,11 @@ }, { "cell_type": "markdown", - "id": "next-steps-24e25294-3e59-4982-92d0-a998cfbe3bb5", + "id": "a2832750", "metadata": {}, "source": [ + "\n", + "\n", "## Next steps\n", "\n", "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." @@ -1910,9 +2048,11 @@ }, { "cell_type": "markdown", - "id": "next-docs-7773f25a-ef1c-40d1-9b4b-f6cecdf7dc6c", + "id": "a8cb1a58", "metadata": {}, "source": [ + "\n", + "\n", "### Work with your model documentation\n", "\n", "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", @@ -1926,9 +2066,11 @@ }, { "cell_type": "markdown", - "id": "002a6b1d", + "id": "94ef26be", "metadata": {}, "source": [ + "\n", + "\n", "### Customize the banking agent for your use case\n", "\n", "You've now built an agentic AI system designed for banking use cases that supports compliance with supervisory guidance such as SR 11-7 and SS1/23, covering credit and fraud risk assessment for both retail and commercial banking. Extend this example agent to real-world banking scenarios and production deployment by:\n", @@ -1941,9 +2083,11 @@ }, { "cell_type": "markdown", - "id": "next-resources-779f3903-eb13-4242-9438-e0dfc1753d4d", + "id": "a681e49c", "metadata": {}, "source": [ + "\n", + "\n", "### Discover more learning resources\n", "\n", "Learn more about the ValidMind Library tools we used in this notebook:\n", @@ -1962,9 +2106,10 @@ }, { "cell_type": "markdown", - "id": "upgrade-vm-362dbd51-d699-405b-b817-f95381861976", "metadata": {}, "source": [ + "\n", + "\n", "## Upgrade ValidMind\n", "\n", "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", @@ -1975,7 +2120,7 @@ { "cell_type": "code", "execution_count": null, - "id": "upgrade-show-2ce32ad2-2751-4b40-a1ec-161b178603b1", + "id": "9733adff", "metadata": {}, "outputs": [], "source": [ @@ -1984,7 +2129,7 @@ }, { "cell_type": "markdown", - "id": "upgrade-version-61359c61-7218-4a90-9f6f-70794c42d997", + "id": "e4b0b646", "metadata": {}, "source": [ "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", @@ -1996,7 +2141,7 @@ }, { "cell_type": "markdown", - "id": "upgrade-restart-c45548b7-aee3-4b96-b412-37c781071787", + "id": "387fa7f1", "metadata": {}, "source": [ "You may need to restart your kernel after running the upgrade package for changes to be applied." @@ -2004,7 +2149,6 @@ }, { "cell_type": "markdown", - "id": "copyright-99bac407-03cd-4fe3-83c8-0a280d4caa64", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb b/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb deleted file mode 100644 index 9afebb2e6..000000000 --- a/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb +++ /dev/null @@ -1,1501 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# AI Agent Validation with ValidMind - Banking Demo\n", - "\n", - "This notebook shows how to document and evaluate an agentic AI system with the ValidMind Library. Using a small banking agent built in LangGraph as an example, you will run ValidMind’s built-in and custom tests and produce the artifacts needed to create evidence-backed documentation.\n", - "\n", - "An AI agent is an autonomous system that interprets inputs, selects from available tools or actions, and carries out multi-step behaviors to achieve user goals. In this example, our agent acts as a professional banking assistant that analyzes user requests and automatically selects and invokes the most appropriate specialized banking tool (credit, account, or fraud) to deliver accurate, compliant, and actionable responses.\n", - "\n", - "However, agentic capabilities bring concrete risks. The agent may misinterpret user inputs or fail to extract required parameters, producing incorrect credit assessments or inappropriate account actions; it can select the wrong tool (for example, invoking account management instead of fraud detection), which may cause unsafe, non-compliant, or customer-impacting behaviour.\n", - "\n", - "This interactive notebook guides you step-by-step through building a demo LangGraph banking agent, preparing an evaluation dataset, initializing the ValidMind Library and required objects, writing custom tests for tool-selection accuracy and entity extraction, running ValidMind’s built-in and custom test suites, and logging documentation artifacts to ValidMind." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Install the ValidMind Library](#toc2_1__) \n", - " - [Initialize the ValidMind Library](#toc2_2__) \n", - " - [Register sample model](#toc2_2_1__) \n", - " - [Apply documentation template](#toc2_2_2__) \n", - " - [Get your code snippet](#toc2_2_3__) \n", - " - [Initialize the Python environment](#toc2_3__) \n", - "- [Banking Tools](#toc3__) \n", - " - [Tool Overview](#toc3_1__) \n", - " - [Test Banking Tools Individually](#toc3_2__) \n", - "- [Complete LangGraph Banking Agent](#toc4__) \n", - "- [ValidMind Model Integration](#toc5__) \n", - "- [Prompt Validation](#toc6__) \n", - "- [Banking Test Dataset](#toc7__) \n", - " - [Initialize ValidMind Dataset](#toc7_1__) \n", - " - [Run the Agent and capture result through assign predictions](#toc7_2__) \n", - " - [Dataframe Display Settings](#toc7_2_1__) \n", - "- [Banking Accuracy Test](#toc8__) \n", - "- [Banking Tool Call Accuracy Test](#toc9__) \n", - "- [Scorers in ValidMind](#toc10__)\n", - " - [Plan Quality Metric scorer](#toc10_1) \n", - " - [Plan Adherence Metric scorer](#toc10_2) \n", - " - [Tool Correctness Metric scorer](#toc10_3) \n", - " - [Argument Correctness Metric scorer](#toc10_4) \n", - " - [Task Completion scorer](#toc10_5) \n", - "- [RAGAS Tests for an Agent Evaluation](#toc12__) \n", - " - [Faithfulness](#toc12_1__) \n", - " - [Response Relevancy](#toc12_2__) \n", - " - [Context Recall](#toc12_3__) \n", - "- [Safety](#toc13__) \n", - " - [AspectCritic](#toc13_1__) \n", - " - [Prompt bias](#toc13_2__) \n", - " - [Toxicity](#toc13_3__) \n", - "- [Demo Summary and Next Steps](#toc14__) \n", - " - [What We Built](#toc14_1__) \n", - " - [Next Steps](#toc14_2__) \n", - " - [Key Benefits](#toc14_3__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. 
There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q \"validmind[llm]\" " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Agentic AI System`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. 
Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import validmind as vm\n", - "\n", - "vm.init(\n", - " api_host=\"...\",\n", - " api_key=\"...\",\n", - " api_secret=\"...\",\n", - " model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the Python environment\n", - "\n", - "Next, let's import all the necessary libraries for building our banking LangGraph agent system:\n", - "\n", - "- **LangChain components** for LLM integration and tool management\n", - "- **LangGraph** for building stateful, multi-step agent workflows\n", - "- **ValidMind** for model validation and testing\n", - "- **Banking tools** for specialized financial services\n", - "- **Standard libraries** for data handling and environment management\n", - "\n", - "The setup includes loading environment variables (like OpenAI API keys) needed for the LLM components to function properly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Standard library imports\n", - "from typing import TypedDict, Annotated, Sequence\n", - "\n", - "# Third party imports\n", - "import pandas as pd\n", - "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", - "from langchain_openai import ChatOpenAI\n", - "from langgraph.checkpoint.memory import MemorySaver\n", - "from langgraph.graph import StateGraph, END, START\n", - "from langgraph.graph.message import add_messages\n", - "from langgraph.prebuilt import ToolNode\n", - "\n", - "# Local imports\n", - "from banking_tools import AVAILABLE_TOOLS\n", - "from validmind.tests import run_test\n", - "\n", - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_colwidth', None)\n", - "pd.set_option('display.width', None)\n", - "pd.set_option('display.max_rows', None)\n", - "\n", - "# Load environment variables if using .env file\n", - "try:\n", - " from dotenv import load_dotenv\n", - " load_dotenv()\n", - "except ImportError:\n", - " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Banking Tools\n", - "\n", - "Now let's use the following banking demo tools that provide use cases of the financial services:\n", - "\n", - "\n", - "\n", - "### Tool Overview\n", - "1. **Credit Risk Analyzer** - Loan applications and credit decisions\n", - "2. **Customer Account Manager** - Account services and customer support\n", - "3. **Fraud Detection System** - Security and fraud prevention" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", - "print(\"\\nTool Details:\")\n", - "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", - " print(f\" - {tool.name}\") " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Test Banking Tools Individually\n", - "\n", - "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Testing Individual Banking Tools\")\n", - "print(\"=\" * 60)\n", - "\n", - "# Test 1: Credit Risk Analyzer\n", - "print(\"TEST 1: Credit Risk Analyzer\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Access the underlying function using .func\n", - " credit_result = AVAILABLE_TOOLS[0].func(\n", - " customer_income=75000,\n", - " customer_debt=1200,\n", - " credit_score=720,\n", - " loan_amount=50000,\n", - " loan_type=\"personal\"\n", - " )\n", - " print(credit_result)\n", - " print(\"Credit Risk Analyzer test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)\n", - "\n", - "# Test 2: Customer Account Manager\n", - "print(\"TEST 2: Customer Account Manager\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Test checking balance\n", - " account_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"checking\",\n", - " customer_id=\"12345\",\n", - " action=\"check_balance\"\n", - " )\n", - " print(account_result)\n", - " \n", - " # Test getting account info\n", - " info_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"all\",\n", - " customer_id=\"12345\", \n", - " action=\"get_info\"\n", - " )\n", - " print(info_result)\n", - " print(\"Customer Account Manager test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Customer Account Manager test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)\n", - "\n", - "# Test 3: Fraud Detection System\n", - "print(\"TEST 3: Fraud Detection System\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " fraud_result = AVAILABLE_TOOLS[2].func(\n", - " transaction_id=\"TX123\",\n", - " customer_id=\"12345\",\n", - " transaction_amount=500.00,\n", - " transaction_type=\"withdrawal\",\n", - " location=\"Miami, FL\",\n", - " device_id=\"DEVICE_001\"\n", - " )\n", - " print(fraud_result)\n", - " print(\"Fraud Detection System test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Fraud Detection System test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Complete LangGraph Banking Agent\n", - "\n", - "Now we'll create our intelligent banking agent with LangGraph that can automatically select and use the appropriate banking tools based on user requests." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Enhanced banking system prompt with tool selection guidance\n", - "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", - " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", - " \n", - " AVAILABLE BANKING TOOLS:\n", - " \n", - " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", - " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", - " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", - " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", - "\n", - " customer_account_manager - Manage customer accounts and provide banking services\n", - " - Use for: account information, transaction processing, product recommendations, customer service\n", - " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", - " - Parameters: account_type, customer_id, action, amount, account_details\n", - "\n", - " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", - " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", - " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", - " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", - "\n", - " BANKING INSTRUCTIONS:\n", - " - Analyze the user's banking request carefully and identify the primary need\n", - " - If they need credit analysis → use credit_risk_analyzer\n", - " - If they need financial calculations → use financial_calculator\n", - " - If they need account services → use customer_account_manager\n", - " - If they need security analysis → use fraud_detection_system\n", - " - Extract relevant parameters from the user's request\n", - " - Provide helpful, accurate banking responses based on tool outputs\n", - " - Always consider banking regulations, risk management, and best practices\n", - " - Be professional and thorough in your analysis\n", - "\n", - " Choose and use tools wisely to provide the most helpful banking assistance.\n", - " \"\"\"\n", - "# Initialize the main LLM for banking responses\n", - "main_llm = ChatOpenAI(\n", - " model=\"gpt-5-mini\",\n", - " reasoning={\n", - " \"effort\": \"low\",\n", - " \"summary\": \"auto\"\n", - " }\n", - ")\n", - "# Bind all banking tools to the main LLM\n", - "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)\n", - "\n", - "# Banking Agent State Definition\n", - "class BankingAgentState(TypedDict):\n", - " messages: Annotated[Sequence[BaseMessage], add_messages]\n", - " user_input: str\n", - " session_id: str\n", - " context: dict\n", - "\n", - "def create_banking_langgraph_agent():\n", - " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", - " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", - " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", - " messages = state[\"messages\"]\n", - " # Add system context to messages\n", - " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", - " # Get LLM response with tool 
selection\n", - " response = llm_with_tools.invoke(enhanced_messages)\n", - " return {\n", - " **state,\n", - " \"messages\": messages + [response]\n", - " }\n", - " \n", - " def should_continue(state: BankingAgentState) -> str:\n", - " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", - " last_message = state[\"messages\"][-1]\n", - " # Check if the LLM wants to use tools\n", - " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", - " return \"tools\"\n", - " return END\n", - " \n", - " # Create the banking state graph\n", - " workflow = StateGraph(BankingAgentState)\n", - " # Add nodes\n", - " workflow.add_node(\"llm\", llm_node)\n", - " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", - " # Simplified entry point - go directly to LLM\n", - " workflow.add_edge(START, \"llm\")\n", - " # From LLM, decide whether to use tools or end\n", - " workflow.add_conditional_edges(\n", - " \"llm\",\n", - " should_continue,\n", - " {\"tools\": \"tools\", END: END}\n", - " )\n", - " # Tool execution flows back to LLM for final response\n", - " workflow.add_edge(\"tools\", \"llm\")\n", - " # Set up memory\n", - " memory = MemorySaver()\n", - " # Compile the graph\n", - " agent = workflow.compile(checkpointer=memory)\n", - " return agent\n", - "\n", - "# Create the banking intelligent agent\n", - "banking_agent = create_banking_langgraph_agent()\n", - "\n", - "print(\"Banking LangGraph Agent Created Successfully!\")\n", - "print(\"\\nFeatures:\")\n", - "print(\" - Intelligent banking tool selection\")\n", - "print(\" - Comprehensive banking system prompt\")\n", - "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", - "print(\" - Automatic tool parameter extraction\")\n", - "print(\" - Professional banking assistance\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## ValidMind Model Integration\n", - "\n", - "Now we'll integrate our banking LangGraph agent with ValidMind for comprehensive testing and validation." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.models import Prompt\n", - "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list\n", - "def banking_agent_fn(input):\n", - " \"\"\"\n", - " Invoke the banking agent with the given input.\n", - " \"\"\"\n", - " try:\n", - " # Initial state for banking agent\n", - " initial_state = {\n", - " \"user_input\": input[\"input\"],\n", - " \"messages\": [HumanMessage(content=input[\"input\"])],\n", - " \"session_id\": input[\"session_id\"],\n", - " \"context\": {}\n", - " }\n", - " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", - " result = banking_agent.invoke(initial_state, config=session_config)\n", - "\n", - " from utils import capture_tool_output_messages\n", - "\n", - " # Capture all tool outputs and metadata\n", - " captured_data = capture_tool_output_messages(result)\n", - " \n", - " # Access specific tool outputs, this will be used for RAGAS tests\n", - " tool_message = \"\"\n", - " for output in captured_data[\"tool_outputs\"]:\n", - " tool_message += output['content']\n", - " \n", - " tool_calls_found = []\n", - " messages = result['messages']\n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - "\n", - "\n", - " return {\n", - " \"prediction\": result['messages'][-1].content[0]['text'],\n", - " \"output\": result,\n", - " \"tool_messages\": [tool_message],\n", - " # \"tool_calls\": tool_calls_found,\n", - " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", - " }\n", - " except Exception as e:\n", - " # Return a fallback response if the agent fails\n", - " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", - " Please try rephrasing your question or contact support if the issue persists.\"\"\"\n", - " return {\n", - " \"prediction\": error_message, \n", - " \"output\": {\n", - " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", - " \"error\": str(e)\n", - " }\n", - " }\n", - "\n", - "## Initialize the model\n", - "vm_banking_model = vm.init_model(\n", - " input_id=\"banking_agent_model\",\n", - " predict_fn=banking_agent_fn,\n", - " prompt=Prompt(template=system_context)\n", - ")\n", - "\n", - "# Add the banking agent to the vm model\n", - "vm_banking_model.model = banking_agent\n", - "\n", - "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", - "print(f\"Model ID: {vm_banking_model.input_id}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Prompt Validation\n", - "\n", - "Let's get an initial sense of how well the prompt meets a few best practices for prompt engineering. 
These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", - "\n", - "- **Clarity**: How clearly the prompt states the task.\n", - "- **Conciseness**: How succinctly the prompt states the task.\n", - "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **NegativeInstruction**: Whether the prompt contains negative instructions.\n", - "- **Specificity**: How specific the prompt defines the task." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Clarity\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Conciseness\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Delimitation\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.NegativeInstruction\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Specificity\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Banking Test Dataset\n", - "\n", - "We'll use a sample test dataset to evaluate our agent's performance across different banking scenarios.\n", - "\n", - "\n", - "\n", - "### Initialize ValidMind Dataset\n", - "\n", - "Before we can run tests and evaluations, we need to initialize our banking test dataset as a ValidMind dataset object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import our banking-specific test dataset\n", - "from banking_test_dataset import banking_test_dataset\n", - "\n", - "vm_test_dataset = vm.init_dataset(\n", - " input_id=\"banking_test_dataset\",\n", - " dataset=banking_test_dataset.sample(2),\n", - " text_column=\"input\",\n", - " target_column=\"possible_outputs\",\n", - ")\n", - "\n", - "print(\"Banking Test Dataset Initialized in ValidMind!\")\n", - "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n", - "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n", - "vm_test_dataset._df.head(1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Run the Agent and capture result through assign predictions\n", - "\n", - "Now we'll execute our banking agent on the test dataset and capture its responses for evaluation." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_predictions(vm_banking_model)\n", - "\n", - "print(\"Banking Agent Predictions Generated Successfully!\")\n", - "print(f\"Predictions assigned to {len(vm_test_dataset._df)} test cases\")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Banking Accuracy Test\n", - "\n", - "This test evaluates the banking agent's ability to provide accurate responses by:\n", - "- Testing against a dataset of predefined banking questions and expected answers\n", - "- Checking if responses contain expected keywords and banking terminology\n", - "- Providing detailed test results including pass/fail status\n", - "- Helping identify any gaps in the agent's banking knowledge or response quality" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "@vm.test(\"my_custom_tests.banking_accuracy_test\")\n", - "def banking_accuracy_test(model, dataset, list_of_columns):\n", - " \"\"\"\n", - " The Banking Accuracy Test evaluates whether the agent’s responses include \n", - " critical domain-specific keywords and phrases that indicate accurate, compliant,\n", - " and contextually appropriate banking information. This test ensures that the agent\n", - " provides responses containing the expected banking terminology, risk classifications,\n", - " account details, or other domain-relevant information required for regulatory compliance,\n", - " customer safety, and operational accuracy.\n", - " \"\"\"\n", - " df = dataset._df\n", - " \n", - " # Pre-compute responses for all tests\n", - " y_true = dataset.y.tolist()\n", - " y_pred = dataset.y_pred(model).tolist()\n", - "\n", - " # Check each response for at least one of its expected keywords\n", - " test_results = []\n", - " for response, keywords in zip(y_pred, y_true):\n", - " # Convert keywords to list if not already a list\n", - " if not isinstance(keywords, list):\n", - " keywords = [keywords]\n", - " test_results.append(any(str(keyword).lower() in str(response).lower() for keyword in keywords))\n", - " \n", - " results = pd.DataFrame()\n", - " column_names = [col + \"_details\" for col in list_of_columns]\n", - " results[column_names] = df[list_of_columns]\n", - " results[\"actual\"] = y_pred\n", - " results[\"expected\"] = y_true\n", - " results[\"passed\"] = test_results\n", - " results[\"error\"] = [None if passed else f'Response did not contain any expected keywords: {expected}' for passed, expected in zip(test_results, y_true)]\n", - " \n", - " return results\n", - " \n", - "result = run_test(\n", - " \"my_custom_tests.banking_accuracy_test\",\n", - " inputs={\n", - " \"dataset\": vm_test_dataset,\n", - " \"model\": vm_banking_model\n", - " },\n", - " params={\n", - " \"list_of_columns\": [\"input\"]\n", - " }\n", - ")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Banking Tool Call Accuracy Test\n", - "\n", - "This test evaluates how accurately our intelligent banking router selects the correct tools for different banking requests. It provides quantitative feedback on the agent's core intelligence - its ability to understand what users need and select the right banking tools to help them."
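- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The custom test below reads the raw agent output that `assign_predictions` stored in the `banking_agent_model_output` column. If you want to see what that structure looks like first, here is a minimal peek at the tool calls recorded for the first test case (it assumes the predictions above have already been assigned)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Peek at the tool calls captured for the first test case.\n", - "# The column name follows the {input_id}_output convention used elsewhere in this notebook.\n", - "first_output = vm_test_dataset._df[\"banking_agent_model_output\"].iloc[0]\n", - "for message in first_output[\"messages\"]:\n", - "    if getattr(message, \"tool_calls\", None):\n", - "        print([tc[\"name\"] if isinstance(tc, dict) else tc.name for tc in message.tool_calls])"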
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.BankingToolCallAccuracy\")\n", - "def BankingToolCallAccuracy(dataset, agent_output_column, expected_tools_column):\n", - " \"\"\"\n", - " Evaluates the tool selection accuracy of a LangGraph-powered banking agent.\n", - "\n", - " This test measures whether the agent correctly identifies and invokes the required banking tools\n", - " for each user query scenario.\n", - " For each case, the outputs generated by the agent (including its tool calls) are compared against an\n", - " expected set of tools. The test considers both coverage and exactness: it computes the proportion of\n", - " expected tools correctly called by the agent for each instance.\n", - "\n", - " Parameters:\n", - " dataset (VMDataset): The dataset containing user queries, agent outputs, and ground-truth tool expectations.\n", - " agent_output_column (str): Dataset column name containing agent outputs (should include tool call details in 'messages').\n", - " expected_tools_column (str): Dataset column specifying the true expected tools (as lists).\n", - "\n", - " Returns:\n", - " List[dict]: Per-row dictionaries with details: expected tools, found tools, match count, total expected, and accuracy score.\n", - "\n", - " Purpose:\n", - " Provides diagnostic evidence of the banking agent's core reasoning ability—specifically, its capacity to\n", - " interpret user needs and select the correct banking actions. Useful for diagnosing gaps in tool coverage,\n", - " misclassifications, or breakdowns in agent logic.\n", - "\n", - " Interpretation:\n", - " - An accuracy of 1.0 signals perfect tool selection for that example.\n", - " - Lower scores may indicate partial or complete failures to invoke required tools.\n", - " - Review 'found_tools' vs. 
'expected_tools' to understand the source of discrepancies.\n", - "\n", - " Strengths:\n", - " - Directly tests a core capability of compositional tool-use agents.\n", - " - Framework-agnostic; robust to tool call output format (object or dict).\n", - " - Supports batch validation and result logging for systematic documentation.\n", - "\n", - " Limitations:\n", - " - Does not penalize extra, unnecessary tool calls.\n", - " - Does not assess result quality—only correct invocation.\n", - "\n", - " \"\"\"\n", - " def validate_tool_calls_simple(messages, expected_tools):\n", - " \"\"\"Simple validation of tool calls without RAGAS dependency issues.\"\"\"\n", - " \n", - " tool_calls_found = []\n", - " \n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - " \n", - " # Check if expected tools were called\n", - " accuracy = 0.0\n", - " matches = 0\n", - " if expected_tools:\n", - " matches = sum(1 for tool in expected_tools if tool in tool_calls_found)\n", - " accuracy = matches / len(expected_tools)\n", - " \n", - " return {\n", - " 'expected_tools': expected_tools,\n", - " 'found_tools': tool_calls_found,\n", - " 'matches': matches,\n", - " 'total_expected': len(expected_tools) if expected_tools else 0,\n", - " 'accuracy': accuracy,\n", - " }\n", - "\n", - " df = dataset._df\n", - " \n", - " results = []\n", - " for i, row in df.iterrows():\n", - " result = validate_tool_calls_simple(row[agent_output_column]['messages'], row[expected_tools_column])\n", - " results.append(result)\n", - " \n", - " return results\n", - "\n", - "run_test(\n", - " \"my_custom_tests.BankingToolCallAccuracy\",\n", - " inputs = {\n", - " \"dataset\": vm_test_dataset,\n", - " },\n", - " params = {\n", - " \"agent_output_column\": \"banking_agent_model_output\",\n", - " \"expected_tools_column\": \"expected_tools\"\n", - " }\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Scorers in ValidMind\n", - "\n", - "Scorers are evaluation metrics that analyze model outputs and store their results in the dataset. 
When using `assign_scores()`:\n", - "\n", - "- Each scorer adds a new column to the dataset with format: {scorer_name}_{metric_name}\n", - "- The column contains the numeric score (typically 0-1) for each example\n", - "- Multiple scorers can be run on the same dataset, each adding their own column\n", - "- Scores are persisted in the dataset for later analysis and visualization\n", - "- Common scorer patterns include:\n", - " - Model performance metrics (accuracy, F1, etc)\n", - " - Output quality metrics (relevance, faithfulness)\n", - " - Task-specific metrics (completion, correctness)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### AI Agent Evaluation Metrics\n", - "\n", - "AI agent evaluation metrics are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the **full execution trace**—including reasoning steps, tool calls, intermediate decisions, and outcomes—rather than just single input–output pairs.\n", - "\n", - "These metrics are essential because agent failures often occur in ways traditional LLM metrics miss (e.g., choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently).\n", - "\n", - "**DeepEval’s AI agent evaluation framework** breaks evaluation into three layers with corresponding metric categories:\n", - "\n", - "1. **Reasoning Layer** – Evaluates planning and strategy generation:\n", - "\n", - " * *PlanQualityMetric* – how logical, complete, and efficient the agent’s plan is\n", - " * *PlanAdherenceMetric* – whether the agent follows its own plan during execution \n", - "\n", - "2. **Action Layer** – Assesses tool usage and argument generation:\n", - "\n", - " * *ToolCorrectnessMetric* – whether the agent selects and calls the right tools\n", - " * *ArgumentCorrectnessMetric* – whether the agent generates correct tool arguments\n", - "\n", - "3. **Execution Layer** – Measures end-to-end performance:\n", - "\n", - " * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n", - " * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n", - "\n", - "Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### **Reasoning Layer**\n", - "#### PlanQualityMetric\n", - "Let's measure how well the agent generates a plan before acting. A high score means the plan is logical, complete, and efficient." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.PlanQuality\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### PlanAdherenceMetric\n", - "Let's check whether the agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.PlanAdherence\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " expected_output_column = \"expected_output\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### **Action Layer**\n", - "#### ToolCorrectnessMetric\n", - "Let's evaluate whether the agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.ToolCorrectness\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " expected_tools_column = \"expected_tools\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### ArgumentCorrectnessMetric\n", - "Let's assess whether the agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.ArgumentCorrectness\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### **Execution Layer**\n", - "#### TaskCompletionMetric\n", - "The TaskCompletion test evaluates whether our banking agent successfully completes the requested tasks by analyzing its outputs and tool usage. This metric assesses the agent's ability to understand user requests, execute appropriate actions, and provide complete responses that address the original query. The test provides a score between 0-1 along with detailed feedback on task completion quality." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.TaskCompletion\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TaskCompletion scorer has added a new column 'TaskCompletion_score' to our dataset. 
This is because when we run scorers through assign_scores(), the return values are automatically processed and added as new columns with the format {scorer_name}_{metric_name}. We'll use this column to visualize the distribution of task completion scores across our test cases. Let's visualize the distribution through the box plot test." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.plots.BoxPlot\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " params={\n", - " \"columns\": \"TaskCompletion_score\",\n", - " \"title\": \"Distribution of Task Completion Scores\",\n", - " \"ylabel\": \"Score\",\n", - " \"figsize\": (8, 6)\n", - " }\n", - ").log()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## RAGAS Tests for an Agent Evaluation\n", - "\n", - "RAGAS (Retrieval-Augmented Generation Assessment) provides specialized metrics for evaluating conversational AI systems like our banking agent. These tests analyze different aspects of agent performance:\n", - "\n", - "Our banking agent uses tools to retrieve information and generates responses based on that context, making it similar to a RAG system. RAGAS metrics help evaluate:\n", - "\n", - "- **Response Quality**: How well the agent uses retrieved tool outputs to generate helpful banking responses\n", - "- **Information Faithfulness**: Whether agent responses accurately reflect tool outputs \n", - "- **Relevance Assessment**: How well responses address the original banking query\n", - "- **Context Utilization**: How effectively the agent incorporates tool results into final answers\n", - "\n", - "These tests provide insights into how well our banking agent integrates tool usage with conversational abilities, ensuring it provides accurate, relevant, and helpful responses to banking users." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Faithfulness\n", - "\n", - "Faithfulness measures how accurately the banking agent's responses reflect the information retrieved from tools. 
This metric evaluates:\n", - "\n", - "**Information Accuracy**: Whether the agent correctly uses tool outputs in its responses\n", - "- **Fact Preservation**: Ensuring credit scores, loan calculations, compliance results are accurately reported\n", - "- **No Hallucination**: Verifying the agent doesn't invent banking information not provided by tools\n", - "- **Source Attribution**: Checking that responses align with actual tool outputs\n", - "\n", - "**Critical for Banking Trust**: Faithfulness is essential for banking agent reliability because users need to trust that:\n", - "- Credit analysis results are reported correctly\n", - "- Financial calculations are accurate \n", - "- Compliance checks return real information\n", - "- Risk assessments are properly communicated" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.Faithfulness\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"response_column\": [\"banking_agent_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Response Relevancy\n", - "\n", - "Response Relevancy evaluates how well the banking agent's answers address the user's original banking question or request. This metric assesses:\n", - "\n", - "**Query Alignment**: Whether responses directly answer what users asked for\n", - "- **Intent Fulfillment**: Checking if the agent understood and addressed the user's actual banking need\n", - "- **Completeness**: Ensuring responses provide sufficient information to satisfy the banking query\n", - "- **Focus**: Avoiding irrelevant information that doesn't help the banking user\n", - "\n", - "**Banking Quality**: Measures the agent's ability to maintain relevant, helpful banking dialogue\n", - "- **Context Awareness**: Responses should be appropriate for the banking conversation context\n", - "- **User Satisfaction**: Answers should be useful and actionable for banking users\n", - "- **Clarity**: Banking information should be presented in a way that directly helps the user\n", - "\n", - "High relevancy indicates the banking agent successfully understands user needs and provides targeted, helpful banking responses." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " params={\n", - " \"user_input_column\": \"input\",\n", - " \"response_column\": \"banking_agent_model_prediction\",\n", - " \"retrieved_contexts_column\": \"banking_agent_model_tool_messages\",\n", - " }\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Recall\n", - "\n", - "Context Recall measures how well the banking agent utilizes the information retrieved from tools when generating its responses. 
This metric evaluates:\n", - "\n", - "**Information Utilization**: Whether the agent effectively incorporates tool outputs into its responses\n", - "- **Coverage**: How much of the available tool information is used in the response\n", - "- **Integration**: How well tool outputs are woven into coherent, natural banking responses\n", - "- **Completeness**: Whether all relevant information from tools is considered\n", - "\n", - "**Tool Effectiveness**: Assesses whether selected banking tools provide useful context for responses\n", - "- **Relevance**: Whether tool outputs actually help answer the user's banking question\n", - "- **Sufficiency**: Whether enough information was retrieved to generate good banking responses\n", - "- **Quality**: Whether the tools provided accurate, helpful banking information\n", - "\n", - "High context recall indicates the banking agent not only selects the right tools but also effectively uses their outputs to create comprehensive, well-informed banking responses." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextRecall\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " \"reference_column\": [\"banking_agent_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Safety\n", - "\n", - "Safety testing is critical for banking AI agents to ensure they operate reliably and securely.\n", - "These tests help validate that our banking agent maintains high standards of fairness and professionalism." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### AspectCritic\n", - "\n", - "AspectCritic provides comprehensive evaluation across multiple dimensions of banking agent performance. This metric analyzes various aspects of response quality:\n", - "\n", - "**Multi-Dimensional Assessment**: Evaluates responses across different quality criteria:\n", - " - **Conciseness**: Whether responses are clear and to-the-point without unnecessary details\n", - " - **Coherence**: Whether responses are logically structured and easy to follow\n", - " - **Correctness**: Accuracy of banking information and appropriateness of recommendations\n", - " - **Harmfulness**: Whether responses could cause harm or damage to users or systems\n", - " - **Maliciousness**: Whether responses contain malicious content or intent\n", - "\n", - "**Holistic Quality Scoring**: Provides an overall assessment that considers:\n", - "- **User Experience**: How satisfying and useful the banking interaction would be for real users\n", - "- **Professional Standards**: Whether responses meet quality expectations for production banking systems\n", - "- **Consistency**: Whether the banking agent maintains quality across different types of requests\n", - "\n", - "AspectCritic helps identify specific areas where the banking agent excels or needs improvement, providing actionable insights for enhancing overall performance and user satisfaction in banking scenarios." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AspectCritic\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"response_column\": [\"banking_agent_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Prompt bias\n", - "\n", - "Let's check if the agent's prompts contain unintended biases that could affect banking decisions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Bias\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity\n", - "\n", - "Let's ensure responses are professional and appropriate for banking contexts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Toxicity\",\n", - " inputs={\n", - " \"dataset\": vm_test_dataset,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Demo Summary and Next Steps\n", - "\n", - "We have successfully built and tested a comprehensive **Banking AI Agent** using LangGraph and ValidMind. Here's what we've accomplished:\n", - "\n", - "\n", - "\n", - "### What We Built\n", - "\n", - "1. **5 Specialized Banking Tools**\n", - " - Credit Risk Analyzer for loan assessments\n", - " - Customer Account Manager for account services\n", - " - Fraud Detection System for security monitoring\n", - "\n", - "2. **Intelligent LangGraph Agent**\n", - " - Automatic tool selection based on user requests\n", - " - Banking-specific system prompts and guidance\n", - " - Professional banking assistance and responses\n", - "\n", - "3. **Comprehensive Testing Framework**\n", - " - banking-specific test cases\n", - " - ValidMind integration for validation\n", - " - Performance analysis across banking domains\n", - "\n", - "\n", - "\n", - "### Next Steps\n", - "\n", - "1. **Customize Tools**: Adapt the banking tools to your specific banking requirements\n", - "2. **Expand Test Cases**: Add more banking scenarios and edge cases\n", - "3. **Integrate with Real Data**: Connect to actual banking systems and databases\n", - "4. **Add More Tools**: Implement additional banking-specific functionality\n", - "5. **Production Deployment**: Deploy the agent in a production banking environment\n", - "\n", - "\n", - "\n", - "### Key Benefits\n", - "\n", - "- **Industry-Specific**: Designed specifically for banking operations\n", - "- **Regulatory Compliance**: Built-in SR 11-7 and SS 1-23 compliance checks\n", - "- **Risk Management**: Comprehensive credit and fraud risk assessment\n", - "- **Customer Focus**: Tools for both retail and commercial banking needs\n", - "- **Real-World Applicability**: Addresses actual banking use cases and challenges\n", - "\n", - "Your banking AI agent is now ready to handle real-world banking scenarios while maintaining regulatory compliance and risk management best practices!" 
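- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As a final sanity check, you can review the evaluation columns that the scorers attached to the test dataset during this run. This is a minimal sketch; it assumes the DeepEval scorers above all follow the `{scorer_name}_{metric_name}` naming pattern (ending in `_score`) described in the scorers section." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# List the score columns added by assign_scores() and summarize them.\n", - "# Assumes all scorer columns end with \"_score\", as TaskCompletion_score does.\n", - "score_columns = [c for c in vm_test_dataset._df.columns if c.endswith(\"_score\")]\n", - "print(score_columns)\n", - "vm_test_dataset._df[score_columns].describe()"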
- ] - }, - { - "cell_type": "markdown", - "id": "copyright-e7184e5605bb4f85b3d7b8306aaaef78", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ValidMind (Poetry)", - "language": "python", - "name": "validmind" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 9053cd396e7b6792a00857e52e6e3aa83abcc7f1 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 15:20:46 -0800 Subject: [PATCH 51/54] Running make copyright --- .../agents/document_agentic_ai.ipynb | 4415 +++++++++-------- 1 file changed, 2230 insertions(+), 2185 deletions(-) diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index efadb761e..d5203a92f 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -1,2187 +1,2232 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "e7277c38", - "metadata": {}, - "source": [ - "# Document an agentic AI system\n", - "\n", - "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, assign AI evaluation metric scores to your agent, and run accuracy, RAGAS, and safety tests, then log those test results to the ValidMind Platform.\n", - "\n", - "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", - "\n", - "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", - "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool.\n", - "\n", - "
For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", - "

\n", - "Before you continue, ensure that a valid OPENAI_API_KEY is set in your .env file.
" - ] - }, - { - "cell_type": "markdown", - "id": "a47dd942", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Install the ValidMind Library](#toc2_1__) \n", - " - [Initialize the ValidMind Library](#toc2_2__) \n", - " - [Register sample model](#toc2_2_1__) \n", - " - [Apply documentation template](#toc2_2_2__) \n", - " - [Get your code snippet](#toc2_2_3__) \n", - " - [Preview the documentation template](#toc2_2_4__) \n", - " - [Verify OpenAI API access](#toc2_3__) \n", - " - [Initialize the Python environment](#toc2_4__) \n", - "- [Building the LangGraph agent](#toc3__) \n", - " - [Test available banking tools](#toc3_1__) \n", - " - [Create LangGraph banking agent](#toc3_2__) \n", - " - [Define system prompt](#toc3_2_1__) \n", - " - [Initialize the LLM](#toc3_2_2__) \n", - " - [Define agent state structure](#toc3_2_3__) \n", - " - [Create agent workflow function](#toc3_2_4__) \n", - " - [Instantiate the banking agent](#toc3_2_5__) \n", - " - [Integrate agent with ValidMind](#toc3_3__) \n", - " - [Import ValidMind components](#toc3_3_1__) \n", - " - [Create agent wrapper function](#toc3_3_2__) \n", - " - [Initialize the ValidMind model object](#toc3_3_3__) \n", - " - [Store the agent reference](#toc3_3_4__) \n", - " - [Verify integration](#toc3_3_5__) \n", - " - [Validate the system prompt](#toc3_4__) \n", - "- [Initialize the ValidMind datasets](#toc4__) \n", - " - [Assign predictions](#toc4_1__) \n", - "- [Running accuracy tests](#toc5__) \n", - " - [Response accuracy test](#toc5_1__) \n", - " - [Tool selection accuracy test](#toc5_2__) \n", - "- [Assigning AI evaluation metric scores](#toc6__) \n", - " - [Identify relevant DeepEval scorers](#toc6_1__) \n", - " - [Assign reasoning scores](#toc6_2__) \n", - " - [Plan quality score](#toc6_2_1__) \n", - " - [Plan adherence score](#toc6_2_2__) \n", - " - [Assign action scores](#toc6_3__) \n", - " - [Tool correctness score](#toc6_3_1__) \n", - " - [Argument correctness score](#toc6_3_2__) \n", - " - [Assign execution scores](#toc6_4__) \n", - " - [Task completion score](#toc6_4_1__) \n", - " - [Step efficiency score](#toc6_4_2__) \n", - "- [Running RAGAS tests](#toc7__) \n", - " - [Identify relevant RAGAS tests](#toc7_1__) \n", - " - [Faithfulness](#toc7_1_1__) \n", - " - [Response Relevancy](#toc7_1_2__) \n", - " - [Context Recall](#toc7_1_3__) \n", - "- [Running safety tests](#toc8__) \n", - " - [AspectCritic](#toc8_1_1__) \n", - " - [Bias](#toc8_1_2__) \n", - "- [Next steps](#toc9__) \n", - " - [Work with your model documentation](#toc9_1__) \n", - " - [Customize the banking agent for your use case](#toc9_2__) \n", - " - [Discover more learning resources](#toc9_3__) \n", - "- [Upgrade ValidMind](#toc10__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "ecaad35f", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." - ] - }, - { - "cell_type": "markdown", - "id": "6ff1f9ef", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." - ] - }, - { - "cell_type": "markdown", - "id": "d7ad8d8c", - "metadata": {}, - "source": [ - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
" - ] - }, - { - "cell_type": "markdown", - "id": "323caa59", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", - "\n", - "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
- ] - }, - { - "cell_type": "markdown", - "id": "ddba5169", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "
Recommended Python versions\n", - "

\n", - "Python 3.8 <= x <= 3.11
\n", - "\n", - "Let's begin by installing the ValidMind Library with large language model (LLM) support:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1982a118", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" - ] - }, - { - "cell_type": "markdown", - "id": "dc9dea3a", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "id": "5848461e", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook.\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "id": "97d0b04b", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Agentic AI`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "id": "b279d5fa", - "metadata": {}, - "source": [ - "
Can't select this template?\n", - "

\n", - "Your organization administrators may need to add it to your template library:\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6ccbefc", - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Preview the documentation template\n", - "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", - "\n", - "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dffdaa6f", - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Verify OpenAI API access\n", - "\n", - "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22cc39cb", - "metadata": {}, - "outputs": [], - "source": [ - "# Load environment variables if using .env file\n", - "try:\n", - " from dotenv import load_dotenv\n", - " load_dotenv()\n", - "except ImportError:\n", - " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the Python environment\n", - "\n", - "Let's import all the necessary libraries to prepare for building our banking LangGraph agentic system:\n", - "\n", - "- **Standard libraries** for data handling and environment management.\n", - "- **pandas**, a Python library for data manipulation and analytics, as an alias. We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", - "- **LangChain** components for LLM integration and tool management.\n", - "- **LangGraph** for building stateful, multi-step agent workflows.\n", - "- **Banking tools** for specialized financial services as defined in [banking_tools.py](banking_tools.py)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2058d1ac", - "metadata": {}, - "outputs": [], - "source": [ - "# STANDARD LIBRARY IMPORTS\n", - "\n", - "# TypedDict: Defines type-safe dictionaries for the agent's state structure\n", - "# Annotated: Adds metadata to type hints\n", - "# Sequence: Type hint for sequences used in the agent\n", - "from typing import TypedDict, Annotated, Sequence\n", - "\n", - "# THIRD PARTY IMPORTS\n", - "\n", - "import pandas as pd\n", - "# Configure pandas to show all columns and all rows at full width\n", - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_colwidth', None)\n", - "pd.set_option('display.width', None)\n", - "pd.set_option('display.max_rows', None)\n", - "\n", - "# BaseMessage: Represents a base message in the LangChain message system\n", - "# HumanMessage: Represents a human message in the LangChain message system\n", - "# SystemMessage: Represents a system message in the LangChain message system\n", - "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", - "\n", - "# ChatOpenAI: Represents an OpenAI chat model in the LangChain library\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "# MemorySaver: Represents a checkpoint for saving and restoring agent state\n", - "from langgraph.checkpoint.memory import MemorySaver\n", - "\n", - "# StateGraph: Represents a stateful graph in the LangGraph library\n", - "# END: Represents the end of a graph\n", - "# START: Represents the start of a graph\n", - "from langgraph.graph import StateGraph, END, START\n", - "\n", - "# add_messages: Adds messages to the state\n", - "from langgraph.graph.message import add_messages\n", - "\n", - "# ToolNode: Represents a tool node in the LangGraph library\n", - "from langgraph.prebuilt import ToolNode\n", - "\n", - "# LOCAL IMPORTS FROM banking_tools.py\n", - "\n", - "from banking_tools import AVAILABLE_TOOLS" - ] - }, - { - "cell_type": "markdown", - "id": "e109d075", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Building the LangGraph agent" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Test available banking tools\n", - "\n", - "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", - "\n", - "- **Credit Risk Analyzer** - Loan applications and credit decisions\n", - "- **Customer Account Manager** - Account services and customer support\n", - "- **Fraud Detection System** - Security and fraud prevention" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e0a120c", - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", - "print(\"\\nTool Details:\")\n", - "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", - " print(f\" - {tool.name}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc0caff2", - "metadata": {}, - "outputs": [], - "source": [ - "# Test 1: Credit Risk Analyzer\n", - "print(\"TEST 1: Credit Risk Analyzer\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Access the underlying function using .func\n", - " credit_result = AVAILABLE_TOOLS[0].func(\n", - " customer_income=75000,\n", - " customer_debt=1200,\n", - " credit_score=720,\n", - " loan_amount=50000,\n", - " 
loan_type=\"personal\"\n", - " )\n", - " print(credit_result)\n", - " print(\"Credit Risk Analyzer test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6b227db", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Test 2: Customer Account Manager\n", - "print(\"TEST 2: Customer Account Manager\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Test checking balance\n", - " account_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"checking\",\n", - " customer_id=\"12345\",\n", - " action=\"check_balance\"\n", - " )\n", - " print(account_result)\n", - "\n", - " # Test getting account info\n", - " info_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"all\",\n", - " customer_id=\"12345\", \n", - " action=\"get_info\"\n", - " )\n", - " print(info_result)\n", - " print(\"Customer Account Manager test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Customer Account Manager test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a983b30d", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Test 3: Fraud Detection System\n", - "print(\"TEST 3: Fraud Detection System\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " fraud_result = AVAILABLE_TOOLS[2].func(\n", - " transaction_id=\"TX123\",\n", - " customer_id=\"12345\",\n", - " transaction_amount=500.00,\n", - " transaction_type=\"withdrawal\",\n", - " location=\"Miami, FL\",\n", - " device_id=\"DEVICE_001\"\n", - " )\n", - " print(fraud_result)\n", - " print(\"Fraud Detection System test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Fraud Detection System test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "markdown", - "id": "6bf04845", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Create LangGraph banking agent\n", - "\n", - "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Define system prompt\n", - "\n", - "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7971c427", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Enhanced banking system prompt with tool selection guidance\n", - "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", - " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", - " \n", - " AVAILABLE BANKING TOOLS:\n", - " \n", - " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", - " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", - " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", - " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", - "\n", - " customer_account_manager - Manage customer accounts and provide banking services\n", - " - Use for: account information, transaction processing, product recommendations, customer service\n", - " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", - " - Parameters: account_type, customer_id, action, amount, account_details\n", - "\n", - " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", - " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", - " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", - " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", - "\n", - " BANKING INSTRUCTIONS:\n", - " - Analyze the user's banking request carefully and identify the primary need\n", - " - If they need credit analysis → use credit_risk_analyzer\n", - " - If they need financial calculations → use financial_calculator\n", - " - If they need account services → use customer_account_manager\n", - " - If they need security analysis → use fraud_detection_system\n", - " - Extract relevant parameters from the user's request\n", - " - Provide helpful, accurate banking responses based on tool outputs\n", - " - Always consider banking regulations, risk management, and best practices\n", - " - Be professional and thorough in your analysis\n", - "\n", - " Choose and use tools wisely to provide the most helpful banking assistance.\n", - " Describe the response in user friendly manner with details describing the tool output. 
\n", - " Provide the response in at least 500 words.\n", - " Generate a concise execution plan for the banking request.\n", - " \"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Initialize the LLM\n", - "\n", - "Let's initialize the LLM that will power our banking agent:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "866066e7", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the main LLM for banking responses\n", - "main_llm = ChatOpenAI(\n", - " model=\"gpt-5-mini\",\n", - " reasoning={\n", - " \"effort\": \"low\",\n", - " \"summary\": \"auto\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate based on request input and the system prompt we defined above:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "906d8132", - "metadata": {}, - "outputs": [], - "source": [ - "# Bind all banking tools to the main LLM\n", - "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Define agent state structure\n", - "\n", - "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", - "\n", - "- **messages** — The conversation history between the user and agent\n", - "- **user_input** — The current user request\n", - "- **session_id** — A unique identifier for the conversation session\n", - "- **context** — Additional context that can be passed between nodes\n", - "\n", - "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b926ddf", - "metadata": {}, - "outputs": [], - "source": [ - "# Banking Agent State Definition\n", - "class BankingAgentState(TypedDict):\n", - " messages: Annotated[Sequence[BaseMessage], add_messages]\n", - " user_input: str\n", - " session_id: str\n", - " context: dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Create agent workflow function\n", - "\n", - "We'll build the LangGraph agent workflow with two main components:\n", - "\n", - "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", - "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", - "\n", - "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c9bf585", - "metadata": {}, - "outputs": [], - "source": [ - "def create_banking_langgraph_agent():\n", - " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", - " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", - " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", - " messages = state[\"messages\"]\n", - " # Add system context to messages\n", - " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", - " # Get LLM response with tool selection\n", - " response = llm_with_tools.invoke(enhanced_messages)\n", - " return {\n", - " **state,\n", - " \"messages\": messages + [response]\n", - " }\n", - " \n", - " def should_continue(state: BankingAgentState) -> str:\n", - " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", - " last_message = state[\"messages\"][-1]\n", - " # Check if the LLM wants to use tools\n", - " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", - " return \"tools\"\n", - " return END\n", - " \n", - " # Create the banking state graph\n", - " workflow = StateGraph(BankingAgentState)\n", - " # Add nodes\n", - " workflow.add_node(\"llm\", llm_node)\n", - " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", - " # Simplified entry point - go directly to LLM\n", - " workflow.add_edge(START, \"llm\")\n", - " # From LLM, decide whether to use tools or end\n", - " workflow.add_conditional_edges(\n", - " \"llm\",\n", - " should_continue,\n", - " {\"tools\": \"tools\", END: END}\n", - " )\n", - " # Tool execution flows back to LLM for final response\n", - " workflow.add_edge(\"tools\", \"llm\")\n", - " # Set up memory\n", - " memory = MemorySaver()\n", - " # Compile the graph\n", - " agent = workflow.compile(checkpointer=memory)\n", - " return agent" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Instantiate the banking agent\n", - "\n", - "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", - "\n", - "This compiled agent is ready to process banking requests and will automatically select and use the appropriate tools based on user queries:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "455b8ee4", - "metadata": {}, - "outputs": [], - "source": [ - "# Create the banking intelligent agent\n", - "banking_agent = create_banking_langgraph_agent()\n", - "\n", - "print(\"Banking LangGraph Agent Created Successfully!\")\n", - "print(\"\\nFeatures:\")\n", - "print(\" - Intelligent banking tool selection\")\n", - "print(\" - Comprehensive banking system prompt\")\n", - "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", - "print(\" - Automatic tool parameter extraction\")\n", - "print(\" - Professional banking assistance\")" - ] - }, - { - "cell_type": "markdown", - "id": "12691528", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Integrate agent with ValidMind\n", - "\n", - "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Import ValidMind components\n", - "\n", - "We'll start with importing the necessary ValidMind components for integrating our agent:\n", - "\n", - "- `Prompt` from `validmind.models` for handling prompt-based model inputs\n", - "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` from `validmind.scorers.llm.deepeval` for extracting and converting tool calls from agent outputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9aeb8969", - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.models import Prompt\n", - "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Create agent wrapper function\n", - "\n", - "We'll then create a wrapper function that:\n", - "\n", - "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", - "- Invokes the banking agent with the proper state initialization\n", - "- Captures tool outputs and tool calls for evaluation\n", - "- Returns a standardized response format that includes the prediction, full output, tool messages, and tool call information\n", - "- Handles errors gracefully with fallback responses" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e4d5a82", - "metadata": {}, - "outputs": [], - "source": [ - "def banking_agent_fn(input):\n", - " \"\"\"\n", - " Invoke the banking agent with the given input.\n", - " \"\"\"\n", - " try:\n", - " # Initial state for banking agent\n", - " initial_state = {\n", - " \"user_input\": input[\"input\"],\n", - " \"messages\": [HumanMessage(content=input[\"input\"])],\n", - " \"session_id\": input[\"session_id\"],\n", - " \"context\": {}\n", - " }\n", - " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", - " result = banking_agent.invoke(initial_state, config=session_config)\n", - "\n", - " from utils import capture_tool_output_messages\n", - "\n", - " # Capture all tool outputs and metadata\n", - " captured_data = capture_tool_output_messages(result)\n", - " \n", - " # Access specific tool outputs, this will be used for RAGAS tests\n", - " tool_message = \"\"\n", - " for output in captured_data[\"tool_outputs\"]:\n", - " tool_message += output['content']\n", - " \n", - " tool_calls_found = []\n", - " messages = result['messages']\n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - "\n", - "\n", - " return {\n", - " \"prediction\": result['messages'][-1].content[0]['text'],\n", - " \"output\": result,\n", - " \"tool_messages\": [tool_message],\n", - " # \"tool_calls\": tool_calls_found,\n", - " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", - " }\n", - " except Exception as e:\n", - " # Return a fallback response if the agent fails\n", - " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", - " Please try rephrasing your question or contact support if the issue 
persists.\"\"\"\n", - " return {\n", - " \"prediction\": error_message, \n", - " \"output\": {\n", - " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", - " \"error\": str(e)\n", - " }\n", - " }" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Initialize the ValidMind model object\n", - "\n", - "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", - "\n", - "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model) that:\n", - "\n", - "- Associates the wrapper function with the model for prediction\n", - "- Stores the system prompt template for documentation\n", - "- Provides a unique `input_id` for tracking and identification\n", - "- Enables the agent to be used with ValidMind's testing and documentation features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60a2ce7a", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the agent as a model\n", - "vm_banking_model = vm.init_model(\n", - " input_id=\"banking_agent_model\",\n", - " predict_fn=banking_agent_fn,\n", - " prompt=Prompt(template=system_context)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Store the agent reference\n", - "\n", - "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c653471", - "metadata": {}, - "outputs": [], - "source": [ - "# Add the banking agent to the vm model\n", - "vm_banking_model.model = banking_agent" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Verify integration\n", - "\n", - "Let's confirm that the banking agent has been successfully integrated with ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e101b0f", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", - "print(f\"Model ID: {vm_banking_model.input_id}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Validate the system prompt\n", - "\n", - "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering by running a few tests — we'll run evaluation tests later on our agent's performance.\n", - "\n", - "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module. 
Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n",
- "\n",
- "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n",
- "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n",
- "- **[Delimitation](https://docs.validmind.ai/tests/prompt_validation/Delimitation.html)** — When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n",
- "- **[NegativeInstruction](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — Whether the prompt contains negative instructions.\n",
- "- **[Specificity](https://docs.validmind.ai/tests/prompt_validation/Specificity.html)** — How specifically the prompt defines the task."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f52dceb1",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm.tests.run_test(\n",
- " \"validmind.prompt_validation.Clarity\",\n",
- " inputs={\n",
- " \"model\": vm_banking_model,\n",
- " },\n",
- ").log()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "70d52333",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm.tests.run_test(\n",
- " \"validmind.prompt_validation.Conciseness\",\n",
- " inputs={\n",
- " \"model\": vm_banking_model,\n",
- " },\n",
- ").log()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5aa89976",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm.tests.run_test(\n",
- " \"validmind.prompt_validation.Delimitation\",\n",
- " inputs={\n",
- " \"model\": vm_banking_model,\n",
- " },\n",
- ").log()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8630197e",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm.tests.run_test(\n",
- " \"validmind.prompt_validation.NegativeInstruction\",\n",
- " inputs={\n",
- " \"model\": vm_banking_model,\n",
- " },\n",
- ").log()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bba99915",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm.tests.run_test(\n",
- " \"validmind.prompt_validation.Specificity\",\n",
- " inputs={\n",
- " \"model\": vm_banking_model,\n",
- " },\n",
- ").log()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Initialize the ValidMind datasets\n",
- "\n",
- "After validating our system prompt, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use in the next section to evaluate our agent's performance across different banking scenarios:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0c70ca2c",
- "metadata": {},
- "outputs": [],
- "source": [
- "from banking_test_dataset import banking_test_dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The next step is to connect your data with a ValidMind `Dataset` object. **This step is necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n",
- "\n",
- "Initialize a ValidMind dataset object using the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset) from the ValidMind (`vm`) module. For this example, we'll pass in the following arguments:\n",
- "\n",
- "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n",
- "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n",
- "- **`text_column`** — The name of the column containing the text input data.\n",
- "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a7e9d158",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm_test_dataset = vm.init_dataset(\n",
- " input_id=\"banking_test_dataset\",\n",
- " dataset=banking_test_dataset,\n",
- " text_column=\"input\",\n",
- " target_column=\"possible_outputs\",\n",
- ")\n",
- "\n",
- "print(\"Banking Test Dataset Initialized in ValidMind!\")\n",
- "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n",
- "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n",
- "vm_test_dataset._df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Assign predictions\n",
- "\n",
- "Now that both the model object and the dataset have been registered, we'll assign predictions to capture the banking agent's responses for evaluation:\n",
- "\n",
- "- The [`assign_predictions()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#assign_predictions) from the `Dataset` object can link existing predictions to any number of models.\n",
- "- This method links the banking agent's responses to our `vm_test_dataset` dataset for evaluation.\n",
- "\n",
- "If no prediction values are passed, the method will compute predictions automatically:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1d462663",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm_test_dataset.assign_predictions(vm_banking_model)\n",
- "\n",
- "print(\"Banking Agent Predictions Generated Successfully!\")\n",
- "print(f\"Predictions assigned to {len(vm_test_dataset._df)} test cases\")\n",
- "vm_test_dataset._df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8e50467e",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Running accuracy tests\n",
- "\n",
- "Using [`@vm.test`](https://docs.validmind.ai/validmind/validmind.html#test), let's implement some reusable custom *inline tests* to assess the accuracy of our banking agent:\n",
- "\n",
- "- An inline test refers to a test written and executed within the same environment as the code being tested — in this case, right in this Jupyter Notebook — without requiring a separate test file or framework.\n",
- "- You'll note that the custom test functions are just regular Python functions that can include and require any Python library as you see fit."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Response accuracy test\n",
- "\n",
- "We'll create a custom test that evaluates the banking agent's ability to provide accurate responses by:\n",
- "\n",
- "- Testing against a dataset of predefined banking questions and expected answers.\n",
- "- Checking if responses contain expected keywords and banking terminology.\n",
- "- Providing detailed test results including pass/fail status.\n",
- "- Helping identify any gaps in the agent's banking knowledge or response quality."
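- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "At its core, the accuracy check is a case-insensitive keyword match between the agent's response and the expected keywords for each row. The next cell is a minimal, illustrative sketch of that per-row logic (the sample response and keywords are made up); the registered custom test that follows applies the same check across the whole dataset and formats the results for logging."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Minimal sketch of the per-row keyword check used by the custom test below\n",
- "def contains_expected_keyword(response, keywords):\n",
- "    \"\"\"Return True if any expected keyword appears in the response (case-insensitive).\"\"\"\n",
- "    if not isinstance(keywords, list):\n",
- "        keywords = [keywords]\n",
- "    return any(str(keyword).lower() in str(response).lower() for keyword in keywords)\n",
- "\n",
- "# Illustrative example with made-up values, not taken from the test dataset\n",
- "contains_expected_keyword(\n",
- "    \"Your savings account currently has a balance of $2,450.\",\n",
- "    [\"balance\", \"routing number\"]\n",
- ")"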
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90232066", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "@vm.test(\"my_custom_tests.banking_accuracy_test\")\n", - "def banking_accuracy_test(model, dataset, list_of_columns):\n", - " \"\"\"\n", - " The Banking Accuracy Test evaluates whether the agent’s responses include \n", - " critical domain-specific keywords and phrases that indicate accurate, compliant,\n", - " and contextually appropriate banking information. This test ensures that the agent\n", - " provides responses containing the expected banking terminology, risk classifications,\n", - " account details, or other domain-relevant information required for regulatory compliance,\n", - " customer safety, and operational accuracy.\n", - " \"\"\"\n", - " df = dataset._df\n", - " \n", - " # Pre-compute responses for all tests\n", - " y_true = dataset.y.tolist()\n", - " y_pred = dataset.y_pred(model).tolist()\n", - "\n", - " # Vectorized test results\n", - " test_results = []\n", - " for response, keywords in zip(y_pred, y_true):\n", - " # Convert keywords to list if not already a list\n", - " if not isinstance(keywords, list):\n", - " keywords = [keywords]\n", - " test_results.append(any(str(keyword).lower() in str(response).lower() for keyword in keywords))\n", - " \n", - " results = pd.DataFrame()\n", - " column_names = [col + \"_details\" for col in list_of_columns]\n", - " results[column_names] = df[list_of_columns]\n", - " results[\"actual\"] = y_pred\n", - " results[\"expected\"] = y_true\n", - " results[\"passed\"] = test_results\n", - " results[\"error\"] = None if test_results else f'Response did not contain any expected keywords: {y_true}'\n", - " \n", - " return results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the [`log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#log):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e68884d5", - "metadata": {}, - "outputs": [], - "source": [ - "result = vm.tests.run_test(\n", - " \"my_custom_tests.banking_accuracy_test\",\n", - " inputs={\n", - " \"dataset\": vm_test_dataset,\n", - " \"model\": vm_banking_model\n", - " },\n", - " params={\n", - " \"list_of_columns\": [\"input\"]\n", - " }\n", - ")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. Each column in the output serves a specific purpose in evaluating agent performance:\n", - "\n", - "| Column header | Description | Importance |\n", - "|--------------|-------------|------------|\n", - "| **`input`** | Original user query or request | Essential for understanding the context of each test case and tracing which inputs led to specific agent behaviors. |\n", - "| **`expected_tools`** | Banking tools that should be invoked for this request | Enables validation of correct tool selection, which is critical for agentic AI systems where choosing the right tool is a key success metric. 
|\n", - "| **`expected_output`** | Expected output or keywords that should appear in the response | Defines the success criteria for each test case, enabling objective evaluation of whether the agent produced the correct result. |\n", - "| **`session_id`** | Unique identifier for each test session | Allows tracking and correlation of related test runs, debugging specific sessions, and maintaining audit trails. |\n", - "| **`category`** | Classification of the request type | Helps organize test results by domain and identify performance patterns across different banking use cases. |\n", - "| **`banking_agent_model_output`** | Complete agent response including all messages and reasoning | Allows you to examine the full output to assess response quality, completeness, and correctness beyond just keyword matching. |\n", - "| **`banking_agent_model_tool_messages`** | Messages exchanged with the banking tools | Critical for understanding how the agent interacted with tools, what parameters were passed, and what tool outputs were received. |\n", - "| **`banking_agent_model_tool_called`** | Specific tool that was invoked | Enables validation that the agent selected the correct tool for each request, which is fundamental to agentic AI validation. |\n", - "| **`possible_outputs`** | Alternative valid outputs or keywords that could appear in the response | Provides flexibility in evaluation by accounting for multiple acceptable response formats or variations. |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78f7edb1", - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.df.head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "6f233bef", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Tool selection accuracy test\n", - "\n", - "We'll also create a custom test that evaluates the banking agent's ability to select the correct tools for different requests by:\n", - "\n", - "- Testing against a dataset of predefined banking queries with expected tool selections.\n", - "- Comparing the tools actually invoked by the agent against the expected tools for each request.\n", - "- Providing quantitative accuracy scores that measure the proportion of expected tools correctly selected.\n", - "- Helping identify gaps in the agent's understanding of user needs and tool selection logic." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we'll define a helper function that extracts tool calls from the agent's messages and compares them against the expected tools. 
This function handles different message formats (dictionary or object) and calculates accuracy scores:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e68798be", - "metadata": {}, - "outputs": [], - "source": [ - "def validate_tool_calls_simple(messages, expected_tools):\n", - " \"\"\"Simple validation of tool calls without RAGAS dependency issues.\"\"\"\n", - " \n", - " tool_calls_found = []\n", - " \n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - " \n", - " # Check if expected tools were called\n", - " accuracy = 0.0\n", - " matches = 0\n", - " if expected_tools:\n", - " matches = sum(1 for tool in expected_tools if tool in tool_calls_found)\n", - " accuracy = matches / len(expected_tools)\n", - " \n", - " return {\n", - " 'expected_tools': expected_tools,\n", - " 'found_tools': tool_calls_found,\n", - " 'matches': matches,\n", - " 'total_expected': len(expected_tools) if expected_tools else 0,\n", - " 'accuracy': accuracy,\n", - " }" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we'll define the main test function that uses the helper function to evaluate tool selection accuracy across all test cases in the dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "604d7313", - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.BankingToolCallAccuracy\")\n", - "def BankingToolCallAccuracy(dataset, agent_output_column, expected_tools_column):\n", - " \"\"\"\n", - " Evaluates the tool selection accuracy of a LangGraph-powered banking agent.\n", - "\n", - " This test measures whether the agent correctly identifies and invokes the required banking tools\n", - " for each user query scenario.\n", - " For each case, the outputs generated by the agent (including its tool calls) are compared against an\n", - " expected set of tools. The test considers both coverage and exactness: it computes the proportion of\n", - " expected tools correctly called by the agent for each instance.\n", - "\n", - " Parameters:\n", - " dataset (VMDataset): The dataset containing user queries, agent outputs, and ground-truth tool expectations.\n", - " agent_output_column (str): Dataset column name containing agent outputs (should include tool call details in 'messages').\n", - " expected_tools_column (str): Dataset column specifying the true expected tools (as lists).\n", - "\n", - " Returns:\n", - " List[dict]: Per-row dictionaries with details: expected tools, found tools, match count, total expected, and accuracy score.\n", - "\n", - " Purpose:\n", - " Provides diagnostic evidence of the banking agent's core reasoning ability—specifically, its capacity to\n", - " interpret user needs and select the correct banking actions. Useful for diagnosing gaps in tool coverage,\n", - " misclassifications, or breakdowns in agent logic.\n", - "\n", - " Interpretation:\n", - " - An accuracy of 1.0 signals perfect tool selection for that example.\n", - " - Lower scores may indicate partial or complete failures to invoke required tools.\n", - " - Review 'found_tools' vs. 
'expected_tools' to understand the source of discrepancies.\n", - "\n", - " Strengths:\n", - " - Directly tests a core capability of compositional tool-use agents.\n", - " - Framework-agnostic; robust to tool call output format (object or dict).\n", - " - Supports batch validation and result logging for systematic documentation.\n", - "\n", - " Limitations:\n", - " - Does not penalize extra, unnecessary tool calls.\n", - " - Does not assess result quality—only correct invocation.\n", - "\n", - " \"\"\"\n", - " df = dataset._df\n", - " \n", - " results = []\n", - " for i, row in df.iterrows():\n", - " result = validate_tool_calls_simple(row[agent_output_column]['messages'], row[expected_tools_column])\n", - " results.append(result)\n", - " \n", - " return results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we can call our function with `run_test()` and log the test results to the ValidMind Platform:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd14115e", - "metadata": {}, - "outputs": [], - "source": [ - "result = vm.tests.run_test(\n", - " \"my_custom_tests.BankingToolCallAccuracy\",\n", - " inputs={\n", - " \"dataset\": vm_test_dataset,\n", - " },\n", - " params={\n", - " \"agent_output_column\": \"banking_agent_model_output\",\n", - " \"expected_tools_column\": \"expected_tools\"\n", - " }\n", - ")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "id": "f78f4107", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Assigning AI evaluation metric scores\n", - "\n", - "*AI agent evaluation metrics* are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the *full execution trace* — including reasoning steps, tool calls, intermediate decisions, and outcomes, rather than just single input–output pairs. These metrics are essential because agent failures often occur in ways traditional LLM metrics miss — for example, choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently.\n", - "\n", - "In this section, we'll evaluate our banking agent's outputs and add scoring to our sample dataset against metrics defined in [DeepEval’s AI agent evaluation framework](https://deepeval.com/guides/guides-ai-agent-evaluation-metrics) which breaks down AI agent evaluation into three layers with corresponding subcategories: **reasoning**, **action**, and **execution**.\n", - "\n", - "Together, these three metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Identify relevant DeepEval scorers\n", - "\n", - "*Scorers* are evaluation metrics that analyze model outputs and store their results in the dataset:\n", - "\n", - "- Each scorer adds a new column to the dataset with format: `{scorer_name}_{metric_name}`\n", - "- The column contains the numeric score (typically `0`-`1`) for each example\n", - "- Multiple scorers can be run on the same dataset, each adding their own column\n", - "- Scores are persisted in the dataset for later analysis and visualization\n", - "- Common scorer patterns include:\n", - " - Model performance metrics (accuracy, F1, etc.)\n", - " - Output quality metrics (relevance, faithfulness)\n", - " - Task-specific metrics (completion, correctness)\n", - "\n", - "Use `list_scorers()` from [`validmind.scorers`](https://docs.validmind.ai/validmind/validmind/tests.html#scorer) to discover all available scoring methods and their IDs that can be used with `assign_scores()`. We'll filter these results to return only DeepEval scorers for our desired three metrics in a formatted table with descriptions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "730c70ec", - "metadata": {}, - "outputs": [], - "source": [ - "# Load all DeepEval scorers\n", - "llm_scorers_dict = vm.tests.load._load_tests([s for s in vm.scorer.list_scorers() if \"deepeval\" in s.lower()])\n", - "\n", - "# Categorize scorers by metric layer\n", - "reasoning_scorers = {}\n", - "action_scorers = {}\n", - "execution_scorers = {}\n", - "\n", - "for scorer_id, scorer_func in llm_scorers_dict.items():\n", - " tags = getattr(scorer_func, \"__tags__\", [])\n", - " scorer_name = scorer_id.split(\".\")[-1]\n", - "\n", - " if \"reasoning_layer\" in tags:\n", - " reasoning_scorers[scorer_id] = scorer_func\n", - " elif \"action_layer\" in tags:\n", - " # StepEfficiency is tagged as action_layer but belongs to execution per DeepEval framework\n", - " if \"StepEfficiency\" in scorer_name:\n", - " execution_scorers[scorer_id] = scorer_func\n", - " else:\n", - " action_scorers[scorer_id] = scorer_func\n", - " elif \"TaskCompletion\" in scorer_name:\n", - " execution_scorers[scorer_id] = scorer_func\n", - "\n", - "# Display scorers by category\n", - "print(\"=\" * 80)\n", - "print(\"REASONING LAYER\")\n", - "print(\"=\" * 80)\n", - "if reasoning_scorers:\n", - " reasoning_df = vm.tests.load._pretty_list_tests(reasoning_scorers, truncate=True)\n", - " display(reasoning_df)\n", - "else:\n", - " print(\"No reasoning layer scorers found.\")\n", - "\n", - "print(\"\\n\" + \"=\" * 80)\n", - "print(\"ACTION LAYER\")\n", - "print(\"=\" * 80)\n", - "if action_scorers:\n", - " action_df = vm.tests.load._pretty_list_tests(action_scorers, truncate=True)\n", - " display(action_df)\n", - "else:\n", - " print(\"No action layer scorers found.\")\n", - "\n", - "print(\"\\n\" + \"=\" * 80)\n", - "print(\"EXECUTION LAYER\")\n", - "print(\"=\" * 80)\n", - "if execution_scorers:\n", - " execution_df = vm.tests.load._pretty_list_tests(execution_scorers, truncate=True)\n", - " display(execution_df)\n", - "else:\n", - " print(\"No execution layer scorers found.\")" - ] - }, - { - "cell_type": "markdown", - "id": "4dd73d0d", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Assign reasoning scores\n", - "\n", - "*Reasoning* evaluates planning and strategy generation:\n", - "\n", - "- **Plan quality** – How logical, complete, and efficient the agent’s plan is.\n", - "- **Plan 
adherence** – Whether the agent follows its own plan during execution."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "#### Plan quality score\n",
- "\n",
- "Let's measure how well our banking agent generates a plan before acting. A high score means the plan is logical, complete, and efficient."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "52f362ba",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm_test_dataset.assign_scores(\n",
- " metrics = \"validmind.scorers.llm.deepeval.PlanQuality\",\n",
- " input_column = \"input\",\n",
- " actual_output_column = \"banking_agent_model_prediction\",\n",
- " tools_called_column = \"banking_agent_model_tool_called\",\n",
- " agent_output_column = \"banking_agent_model_output\",\n",
- ")\n",
- "vm_test_dataset._df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "#### Plan adherence score\n",
- "\n",
- "Let's check whether our banking agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4124a7c2",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm_test_dataset.assign_scores(\n",
- " metrics = \"validmind.scorers.llm.deepeval.PlanAdherence\",\n",
- " input_column = \"input\",\n",
- " actual_output_column = \"banking_agent_model_prediction\",\n",
- " expected_output_column = \"expected_output\",\n",
- " tools_called_column = \"banking_agent_model_tool_called\",\n",
- " agent_output_column = \"banking_agent_model_output\",\n",
- "\n",
- ")\n",
- "vm_test_dataset._df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6da1ac95",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Assign action scores\n",
- "\n",
- "*Action* assesses tool usage and argument generation:\n",
- "\n",
- "- **Tool correctness** – Whether the agent selects and calls the right tools.\n",
- "- **Argument correctness** – Whether the agent generates correct tool arguments."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "#### Tool correctness score\n",
- "\n",
- "Let's evaluate whether our banking agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8d2e8a25",
- "metadata": {},
- "outputs": [],
- "source": [
- "vm_test_dataset.assign_scores(\n",
- " metrics = \"validmind.scorers.llm.deepeval.ToolCorrectness\",\n",
- " input_column = \"input\",\n",
- " actual_output_column = \"banking_agent_model_prediction\",\n",
- " tools_called_column = \"banking_agent_model_tool_called\",\n",
- " expected_tools_column = \"expected_tools\",\n",
- " agent_output_column = \"banking_agent_model_output\",\n",
- "\n",
- ")\n",
- "vm_test_dataset._df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "#### Argument correctness score\n",
- "\n",
- "Let's assess whether our banking agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04f90489", - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.ArgumentCorrectness\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "c59e5595", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Assign execution scores\n", - "\n", - "*Execution* measures end-to-end performance:\n", - "\n", - "- **Task completion** – Whether the agent successfully completes the intended task.\n", - "- **Step efficiency** – Whether the agent avoids unnecessary or redundant steps." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Task completion score\n", - "\n", - "Let's evaluate whether our banking agent successfully completes the requested tasks. Incomplete task execution can lead to user dissatisfaction and failed banking operations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05024f1f", - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.TaskCompletion\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you recall from the beginning of this section, when we run scorers through `assign_scores()`, the return values are automatically processed and added as new columns with the format `{scorer_name}_{metric_name}`. Note that the task completion scorer has added a new column `TaskCompletion_score` to our dataset.\n", - "\n", - "We'll use this column to visualize the distribution of task completion scores across our test cases through the [BoxPlot test](https://docs.validmind.ai/validmind/validmind/tests/plots/BoxPlot.html#boxplot):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f6d08ca", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.plots.BoxPlot\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " params={\n", - " \"columns\": \"TaskCompletion_score\",\n", - " \"title\": \"Distribution of Task Completion Scores\",\n", - " \"ylabel\": \"Score\",\n", - " \"figsize\": (8, 6)\n", - " }\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Step efficiency score\n", - "\n", - "Let's evaluate whether our banking agent avoids unnecessary or redundant steps during task execution. Inefficient step sequences can lead to increased latency, higher costs, and poor user experience." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa6e154a", - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.StepEfficiency\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "012bbcb8", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Running RAGAS tests\n", - "\n", - "Next, let's run some out-of-the-box *Retrieval-Augmented Generation Assessment* (RAGAS) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", - "\n", - "Our banking agent uses tools to retrieve information and generates responses based on that context, making it similar to a RAG system. RAGAS metrics help evaluate the quality of this integration by analyzing the relationship between retrieved tool outputs, user queries, and generated responses.\n", - "\n", - "These tests provide insights into how well our banking agent integrates tool usage with conversational abilities, ensuring it provides accurate, relevant, and helpful responses to banking users while maintaining fidelity to retrieved information." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Identify relevant RAGAS tests\n", - "\n", - "Let's explore some of ValidMind's available tests. Using ValidMind’s repository of tests streamlines your development testing, and helps you ensure that your models are being documented and evaluated appropriately.\n", - "\n", - "You can pass `tasks` and `tags` as parameters to the [`vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) to filter the tests based on the tags and task types:\n", - "\n", - "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `text_qa` tasks.\n", - "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `ragas` tag.\n", - "\n", - "We'll then run three of these tests returned as examples below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0701f5a9", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.list_tests(task=\"text_qa\", tags=[\"ragas\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Faithfulness\n", - "\n", - "Let's evaluate whether the banking agent's responses accurately reflect the information retrieved from tools. Unfaithful responses can misreport credit analysis, financial calculations, and compliance results—undermining user trust in the banking agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92044533", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.model_validation.ragas.Faithfulness\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"response_column\": [\"banking_agent_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Response Relevancy\n", - "\n", - "Let's evaluate whether the banking agent's answers address the user's original question or request. Irrelevant or off-topic responses can frustrate users and fail to deliver the banking information they need." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7483bc3", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " params={\n", - " \"user_input_column\": \"input\",\n", - " \"response_column\": \"banking_agent_model_prediction\",\n", - " \"retrieved_contexts_column\": \"banking_agent_model_tool_messages\",\n", - " }\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Context Recall\n", - "\n", - "Let's evaluate how well the banking agent uses the information retrieved from tools when generating its responses. Poor context recall can lead to incomplete or underinformed answers even when the right tools were selected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5dc00ce", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.model_validation.ragas.ContextRecall\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " \"reference_column\": [\"banking_agent_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "b987b00e", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Running safety tests\n", - "\n", - "Finally, let's run some out-of-the-box *safety* tests available in the ValidMind Library. Safety tests provide specialized metrics for evaluating whether AI agents operate reliably and securely. These metrics analyze different aspects of agent behavior by assessing adherence to safety guidelines, consistency of outputs, and resistance to harmful or inappropriate requests.\n", - "\n", - "Our banking agent handles sensitive financial information and user requests, making safety and reliability essential. Safety tests help evaluate whether the agent maintains appropriate boundaries, responds consistently and correctly to inputs, and avoids generating harmful, biased, or unprofessional content.\n", - "\n", - "These tests provide insights into how well our banking agent upholds standards of fairness and professionalism, ensuring it operates reliably and securely for banking users." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### AspectCritic\n", - "\n", - "Let's evaluate our banking agent's responses across multiple quality dimensions — conciseness, coherence, correctness, harmfulness, and maliciousness. 
Weak performance on these dimensions can degrade user experience, fall short of professional banking standards, or introduce safety risks. \n", - "\n", - "We'll use the `AspectCritic` we identified earlier:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "148daa2b", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.model_validation.ragas.AspectCritic\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"response_column\": [\"banking_agent_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Bias\n", - "\n", - "Let's evaluate whether our banking agent's prompts contain unintended biases that could affect banking decisions. Biased prompts can lead to unfair or discriminatory outcomes — undermining customer trust and exposing the institution to compliance risk.\n", - "\n", - "We'll first use `list_tests()` again to filter for tests relating to `prompt_validation`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74eba86c", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.list_tests(filter=\"prompt_validation\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And then run the identified `Bias` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "062cf8e7", - "metadata": {}, - "outputs": [], - "source": [ - "vm.tests.run_test(\n", - " \"validmind.prompt_validation.Bias\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "a2832750", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Next steps\n", - "\n", - "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." - ] - }, - { - "cell_type": "markdown", - "id": "a8cb1a58", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Work with your model documentation\n", - "\n", - "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", - "\n", - "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", - "\n", - " What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)\n", - "\n", - "3. Click into any section related to the tests we ran in this notebook, for example: **4.3. Prompt Evaluation** to review the results of the tests we logged." 
- ] - }, - { - "cell_type": "markdown", - "id": "94ef26be", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Customize the banking agent for your use case\n", - "\n", - "You've now built an agentic AI system designed for banking use cases that supports compliance with supervisory guidance such as SR 11-7 and SS1/23, covering credit and fraud risk assessment for both retail and commercial banking. Extend this example agent to real-world banking scenarios and production deployment by:\n", - "\n", - "- Adapting the banking tools to your organization's specific requirements\n", - "- Adding more banking scenarios and edge cases to your test set\n", - "- Connecting the agent to your banking systems and databases\n", - "- Implementing additional banking-specific tools and workflows" - ] - }, - { - "cell_type": "markdown", - "id": "a681e49c", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Discover more learning resources\n", - "\n", - "Learn more about the ValidMind Library tools we used in this notebook:\n", - "\n", - "- [Custom prompts](https://docs.validmind.ai/notebooks/how_to/customize_test_result_descriptions.html)\n", - "- [Custom tests](https://docs.validmind.ai/notebooks/code_samples/custom_tests/implement_custom_tests.html)\n", - "- [ValidMind scorers](https://docs.validmind.ai/notebooks/how_to/assign_scores_complete_tutorial.html)\n", - "\n", - "We also offer many more interactive notebooks to help you document models:\n", - "\n", - "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", - "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", - "\n", - "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9733adff", - "metadata": {}, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "id": "e4b0b646", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "387fa7f1", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ValidMind Library", - "language": "python", - "name": "validmind" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "e7277c38", + "metadata": {}, + "source": [ + "# Document an agentic AI system\n", + "\n", + "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, assign AI evaluation metric scores to your agent, and run accuracy, RAGAS, and safety tests, then log those test results to the ValidMind Platform.\n", + "\n", + "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", + "\n", + "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", + "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool.\n", + "\n", + "
For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", + "

\n", + "Before you continue, ensure that a valid OPENAI_API_KEY is set in your .env file.
" + ] + }, + { + "cell_type": "markdown", + "id": "a47dd942", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Install the ValidMind Library](#toc2_1__) \n", + " - [Initialize the ValidMind Library](#toc2_2__) \n", + " - [Register sample model](#toc2_2_1__) \n", + " - [Apply documentation template](#toc2_2_2__) \n", + " - [Get your code snippet](#toc2_2_3__) \n", + " - [Preview the documentation template](#toc2_2_4__) \n", + " - [Verify OpenAI API access](#toc2_3__) \n", + " - [Initialize the Python environment](#toc2_4__) \n", + "- [Building the LangGraph agent](#toc3__) \n", + " - [Test available banking tools](#toc3_1__) \n", + " - [Create LangGraph banking agent](#toc3_2__) \n", + " - [Define system prompt](#toc3_2_1__) \n", + " - [Initialize the LLM](#toc3_2_2__) \n", + " - [Define agent state structure](#toc3_2_3__) \n", + " - [Create agent workflow function](#toc3_2_4__) \n", + " - [Instantiate the banking agent](#toc3_2_5__) \n", + " - [Integrate agent with ValidMind](#toc3_3__) \n", + " - [Import ValidMind components](#toc3_3_1__) \n", + " - [Create agent wrapper function](#toc3_3_2__) \n", + " - [Initialize the ValidMind model object](#toc3_3_3__) \n", + " - [Store the agent reference](#toc3_3_4__) \n", + " - [Verify integration](#toc3_3_5__) \n", + " - [Validate the system prompt](#toc3_4__) \n", + "- [Initialize the ValidMind datasets](#toc4__) \n", + " - [Assign predictions](#toc4_1__) \n", + "- [Running accuracy tests](#toc5__) \n", + " - [Response accuracy test](#toc5_1__) \n", + " - [Tool selection accuracy test](#toc5_2__) \n", + "- [Assigning AI evaluation metric scores](#toc6__) \n", + " - [Identify relevant DeepEval scorers](#toc6_1__) \n", + " - [Assign reasoning scores](#toc6_2__) \n", + " - [Plan quality score](#toc6_2_1__) \n", + " - [Plan adherence score](#toc6_2_2__) \n", + " - [Assign action scores](#toc6_3__) \n", + " - [Tool correctness score](#toc6_3_1__) \n", + " - [Argument correctness score](#toc6_3_2__) \n", + " - [Assign execution scores](#toc6_4__) \n", + " - [Task completion score](#toc6_4_1__) \n", + " - [Step efficiency score](#toc6_4_2__) \n", + "- [Running RAGAS tests](#toc7__) \n", + " - [Identify relevant RAGAS tests](#toc7_1__) \n", + " - [Faithfulness](#toc7_1_1__) \n", + " - [Response Relevancy](#toc7_1_2__) \n", + " - [Context Recall](#toc7_1_3__) \n", + "- [Running safety tests](#toc8__) \n", + " - [AspectCritic](#toc8_1_1__) \n", + " - [Bias](#toc8_1_2__) \n", + "- [Next steps](#toc9__) \n", + " - [Work with your model documentation](#toc9_1__) \n", + " - [Customize the banking agent for your use case](#toc9_2__) \n", + " - [Discover more learning resources](#toc9_3__) \n", + "- [Upgrade ValidMind](#toc10__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "ecaad35f", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." + ] + }, + { + "cell_type": "markdown", + "id": "6ff1f9ef", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "d7ad8d8c", + "metadata": {}, + "source": [ + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "id": "323caa59", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", + "\n", + "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "id": "ddba5169", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "b53da99c", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "Let's begin by installing the ValidMind Library with large language model (LLM) support:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1982a118", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" + ] + }, + { + "cell_type": "markdown", + "id": "dc9dea3a", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "id": "5848461e", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook.\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "id": "97d0b04b", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Agentic AI`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "id": "b279d5fa", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "3606cb8c", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6ccbefc", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2ed79cf0", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dffdaa6f", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "b5c5ba68", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Verify OpenAI API access\n", + "\n", + "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22cc39cb", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables if using .env file\n", + "try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv()\n", + "except ImportError:\n", + " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" + ] + }, + { + "cell_type": "markdown", + "id": "e4a9d3a9", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the Python environment\n", + "\n", + "Let's import all the necessary libraries to prepare for building our banking LangGraph agentic system:\n", + "\n", + "- **Standard libraries** for data handling and environment management.\n", + "- **pandas**, a Python library for data manipulation and analytics, as an alias. We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", + "- **LangChain** components for LLM integration and tool management.\n", + "- **LangGraph** for building stateful, multi-step agent workflows.\n", + "- **Banking tools** for specialized financial services as defined in [banking_tools.py](banking_tools.py)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2058d1ac", + "metadata": {}, + "outputs": [], + "source": [ + "# STANDARD LIBRARY IMPORTS\n", + "\n", + "# TypedDict: Defines type-safe dictionaries for the agent's state structure\n", + "# Annotated: Adds metadata to type hints\n", + "# Sequence: Type hint for sequences used in the agent\n", + "from typing import TypedDict, Annotated, Sequence\n", + "\n", + "# THIRD PARTY IMPORTS\n", + "\n", + "import pandas as pd\n", + "# Configure pandas to show all columns and all rows at full width\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)\n", + "pd.set_option('display.max_rows', None)\n", + "\n", + "# BaseMessage: Represents a base message in the LangChain message system\n", + "# HumanMessage: Represents a human message in the LangChain message system\n", + "# SystemMessage: Represents a system message in the LangChain message system\n", + "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", + "\n", + "# ChatOpenAI: Represents an OpenAI chat model in the LangChain library\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "# MemorySaver: Represents a checkpoint for saving and restoring agent state\n", + "from langgraph.checkpoint.memory import MemorySaver\n", + "\n", + "# StateGraph: Represents a stateful graph in the LangGraph library\n", + "# END: Represents the end of a graph\n", + "# START: Represents the start of a graph\n", + "from langgraph.graph import StateGraph, END, START\n", + "\n", + "# add_messages: Adds messages to the state\n", + "from langgraph.graph.message import add_messages\n", + "\n", + "# ToolNode: Represents a tool node in the LangGraph library\n", + "from langgraph.prebuilt import ToolNode\n", + "\n", + "# LOCAL IMPORTS FROM banking_tools.py\n", + "\n", + "from banking_tools import AVAILABLE_TOOLS" + ] + }, + { + "cell_type": "markdown", + "id": "e109d075", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Building the LangGraph agent" + ] + }, + { + "cell_type": "markdown", + "id": "15040411", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Test available banking tools\n", + "\n", + "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", + "\n", + "- **Credit Risk Analyzer** - Loan applications and credit decisions\n", + "- **Customer Account Manager** - Account services and customer support\n", + "- **Fraud Detection System** - Security and fraud prevention" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e0a120c", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", + "print(\"\\nTool Details:\")\n", + "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", + " print(f\" - {tool.name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "04d6785a", + "metadata": {}, + "source": [ + "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc0caff2", + "metadata": {}, + "outputs": [], + "source": [ + "# Test 1: Credit Risk Analyzer\n", + "print(\"TEST 1: Credit Risk Analyzer\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " # Access the underlying function using .func\n", + " credit_result = AVAILABLE_TOOLS[0].func(\n", + " customer_income=75000,\n", + " customer_debt=1200,\n", + " 
credit_score=720,\n", + " loan_amount=50000,\n", + " loan_type=\"personal\"\n", + " )\n", + " print(credit_result)\n", + " print(\"Credit Risk Analyzer test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6b227db", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Test 2: Customer Account Manager\n", + "print(\"TEST 2: Customer Account Manager\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " # Test checking balance\n", + " account_result = AVAILABLE_TOOLS[1].func(\n", + " account_type=\"checking\",\n", + " customer_id=\"12345\",\n", + " action=\"check_balance\"\n", + " )\n", + " print(account_result)\n", + "\n", + " # Test getting account info\n", + " info_result = AVAILABLE_TOOLS[1].func(\n", + " account_type=\"all\",\n", + " customer_id=\"12345\", \n", + " action=\"get_info\"\n", + " )\n", + " print(info_result)\n", + " print(\"Customer Account Manager test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Customer Account Manager test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a983b30d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Test 3: Fraud Detection System\n", + "print(\"TEST 3: Fraud Detection System\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " fraud_result = AVAILABLE_TOOLS[2].func(\n", + " transaction_id=\"TX123\",\n", + " customer_id=\"12345\",\n", + " transaction_amount=500.00,\n", + " transaction_type=\"withdrawal\",\n", + " location=\"Miami, FL\",\n", + " device_id=\"DEVICE_001\"\n", + " )\n", + " print(fraud_result)\n", + " print(\"Fraud Detection System test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Fraud Detection System test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "6bf04845", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Create LangGraph banking agent\n", + "\n", + "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." 
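Tool selection is driven by the name and description each tool exposes, because that is what the LLM sees once the tools are bound to it. A quick way to inspect those descriptions, assuming the banking tools are standard LangChain tools with `name` and `description` attributes:

```python
# Inspect what the LLM will see for each banking tool
for tool in AVAILABLE_TOOLS:
    print(f"{tool.name}: {tool.description}")
```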
+ ] + }, + { + "cell_type": "markdown", + "id": "31df57f0", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Define system prompt\n", + "\n", + "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7971c427", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Enhanced banking system prompt with tool selection guidance\n", + "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", + " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", + " \n", + " AVAILABLE BANKING TOOLS:\n", + " \n", + " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", + " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", + " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", + " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", + "\n", + " customer_account_manager - Manage customer accounts and provide banking services\n", + " - Use for: account information, transaction processing, product recommendations, customer service\n", + " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", + " - Parameters: account_type, customer_id, action, amount, account_details\n", + "\n", + " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", + " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", + " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", + " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", + "\n", + " BANKING INSTRUCTIONS:\n", + " - Analyze the user's banking request carefully and identify the primary need\n", + " - If they need credit analysis → use credit_risk_analyzer\n", + " - If they need financial calculations → use financial_calculator\n", + " - If they need account services → use customer_account_manager\n", + " - If they need security analysis → use fraud_detection_system\n", + " - Extract relevant parameters from the user's request\n", + " - Provide helpful, accurate banking responses based on tool outputs\n", + " - Always consider banking regulations, risk management, and best practices\n", + " - Be professional and thorough in your analysis\n", + "\n", + " Choose and use tools wisely to provide the most helpful banking assistance.\n", + " Describe the response in user friendly manner with details describing the tool output. 
\n", + " Provide the response in at least 500 words.\n", + " Generate a concise execution plan for the banking request.\n", + " \"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "406835c8", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Initialize the LLM\n", + "\n", + "Let's initialize the LLM that will power our banking agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "866066e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the main LLM for banking responses\n", + "main_llm = ChatOpenAI(\n", + " model=\"gpt-5-mini\",\n", + " reasoning={\n", + " \"effort\": \"low\",\n", + " \"summary\": \"auto\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cce9685c", + "metadata": {}, + "source": [ + "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate based on request input and the system prompt we defined above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "906d8132", + "metadata": {}, + "outputs": [], + "source": [ + "# Bind all banking tools to the main LLM\n", + "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)" + ] + }, + { + "cell_type": "markdown", + "id": "2bad8799", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Define agent state structure\n", + "\n", + "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", + "\n", + "- **messages** — The conversation history between the user and agent\n", + "- **user_input** — The current user request\n", + "- **session_id** — A unique identifier for the conversation session\n", + "- **context** — Additional context that can be passed between nodes\n", + "\n", + "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b926ddf", + "metadata": {}, + "outputs": [], + "source": [ + "# Banking Agent State Definition\n", + "class BankingAgentState(TypedDict):\n", + " messages: Annotated[Sequence[BaseMessage], add_messages]\n", + " user_input: str\n", + " session_id: str\n", + " context: dict" + ] + }, + { + "cell_type": "markdown", + "id": "47ce81b7", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Create agent workflow function\n", + "\n", + "We'll build the LangGraph agent workflow with two main components:\n", + "\n", + "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", + "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", + "\n", + "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c9bf585", + "metadata": {}, + "outputs": [], + "source": [ + "def create_banking_langgraph_agent():\n", + " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", + " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", + " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", + " messages = state[\"messages\"]\n", + " # Add system context to messages\n", + " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", + " # Get LLM response with tool selection\n", + " response = llm_with_tools.invoke(enhanced_messages)\n", + " return {\n", + " **state,\n", + " \"messages\": messages + [response]\n", + " }\n", + " \n", + " def should_continue(state: BankingAgentState) -> str:\n", + " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", + " last_message = state[\"messages\"][-1]\n", + " # Check if the LLM wants to use tools\n", + " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", + " return \"tools\"\n", + " return END\n", + " \n", + " # Create the banking state graph\n", + " workflow = StateGraph(BankingAgentState)\n", + " # Add nodes\n", + " workflow.add_node(\"llm\", llm_node)\n", + " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", + " # Simplified entry point - go directly to LLM\n", + " workflow.add_edge(START, \"llm\")\n", + " # From LLM, decide whether to use tools or end\n", + " workflow.add_conditional_edges(\n", + " \"llm\",\n", + " should_continue,\n", + " {\"tools\": \"tools\", END: END}\n", + " )\n", + " # Tool execution flows back to LLM for final response\n", + " workflow.add_edge(\"tools\", \"llm\")\n", + " # Set up memory\n", + " memory = MemorySaver()\n", + " # Compile the graph\n", + " agent = workflow.compile(checkpointer=memory)\n", + " return agent" + ] + }, + { + "cell_type": "markdown", + "id": "3eb40287", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Instantiate the banking agent\n", + "\n", + "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", + "\n", + "This compiled agent is ready to process banking requests and will automatically select and use the appropriate tools based on user queries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "455b8ee4", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the banking intelligent agent\n", + "banking_agent = create_banking_langgraph_agent()\n", + "\n", + "print(\"Banking LangGraph Agent Created Successfully!\")\n", + "print(\"\\nFeatures:\")\n", + "print(\" - Intelligent banking tool selection\")\n", + "print(\" - Comprehensive banking system prompt\")\n", + "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", + "print(\" - Automatic tool parameter extraction\")\n", + "print(\" - Professional banking assistance\")" + ] + }, + { + "cell_type": "markdown", + "id": "12691528", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Integrate agent with ValidMind\n", + "\n", + "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7b78509b", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Import ValidMind components\n", + "\n", + "We'll start with importing the necessary ValidMind components for integrating our agent:\n", + "\n", + "- `Prompt` from `validmind.models` for handling prompt-based model inputs\n", + "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` from `validmind.scorers.llm.deepeval` for extracting and converting tool calls from agent outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9aeb8969", + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.models import Prompt\n", + "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" + ] + }, + { + "cell_type": "markdown", + "id": "f67f2955", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Create agent wrapper function\n", + "\n", + "We'll then create a wrapper function that:\n", + "\n", + "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", + "- Invokes the banking agent with the proper state initialization\n", + "- Captures tool outputs and tool calls for evaluation\n", + "- Returns a standardized response format that includes the prediction, full output, tool messages, and tool call information\n", + "- Handles errors gracefully with fallback responses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e4d5a82", + "metadata": {}, + "outputs": [], + "source": [ + "def banking_agent_fn(input):\n", + " \"\"\"\n", + " Invoke the banking agent with the given input.\n", + " \"\"\"\n", + " try:\n", + " # Initial state for banking agent\n", + " initial_state = {\n", + " \"user_input\": input[\"input\"],\n", + " \"messages\": [HumanMessage(content=input[\"input\"])],\n", + " \"session_id\": input[\"session_id\"],\n", + " \"context\": {}\n", + " }\n", + " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", + " result = banking_agent.invoke(initial_state, config=session_config)\n", + "\n", + " from utils import capture_tool_output_messages\n", + "\n", + " # Capture all tool outputs and metadata\n", + " captured_data = capture_tool_output_messages(result)\n", + " \n", + " # Access specific tool outputs, this will be used for RAGAS tests\n", + " tool_message = \"\"\n", + " for output in captured_data[\"tool_outputs\"]:\n", + " tool_message += output['content']\n", + " \n", + " tool_calls_found = []\n", + " messages = result['messages']\n", + " for message in messages:\n", + " if hasattr(message, 'tool_calls') and message.tool_calls:\n", + " for tool_call in message.tool_calls:\n", + " # Handle both dictionary and object formats\n", + " if isinstance(tool_call, dict):\n", + " tool_calls_found.append(tool_call['name'])\n", + " else:\n", + " # ToolCall object - use attribute access\n", + " tool_calls_found.append(tool_call.name)\n", + "\n", + "\n", + " return {\n", + " \"prediction\": result['messages'][-1].content[0]['text'],\n", + " \"output\": result,\n", + " \"tool_messages\": [tool_message],\n", + " # \"tool_calls\": tool_calls_found,\n", + " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", + " }\n", + " except Exception as e:\n", + " # Return a fallback response if the agent fails\n", + " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", + " Please try rephrasing your question or 
contact support if the issue persists.\"\"\"\n", + " return {\n", + " \"prediction\": error_message, \n", + " \"output\": {\n", + " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", + " \"error\": str(e)\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "4bdc90d6", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Initialize the ValidMind model object\n", + "\n", + "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", + "\n", + "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model) that:\n", + "\n", + "- Associates the wrapper function with the model for prediction\n", + "- Stores the system prompt template for documentation\n", + "- Provides a unique `input_id` for tracking and identification\n", + "- Enables the agent to be used with ValidMind's testing and documentation features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60a2ce7a", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the agent as a model\n", + "vm_banking_model = vm.init_model(\n", + " input_id=\"banking_agent_model\",\n", + " predict_fn=banking_agent_fn,\n", + " prompt=Prompt(template=system_context)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "33ed446a", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Store the agent reference\n", + "\n", + "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c653471", + "metadata": {}, + "outputs": [], + "source": [ + "# Add the banking agent to the vm model\n", + "vm_banking_model.model = banking_agent" + ] + }, + { + "cell_type": "markdown", + "id": "bf44ea16", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Verify integration\n", + "\n", + "Let's confirm that the banking agent has been successfully integrated with ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e101b0f", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", + "print(f\"Model ID: {vm_banking_model.input_id}\")" + ] + }, + { + "cell_type": "markdown", + "id": "0c80518d", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Validate the system prompt\n", + "\n", + "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering by running a few tests — we'll run evaluation tests later on our agent's performance.\n", + "\n", + "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module. 
Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n", + "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n", + "- **[Delimitation](https://docs.validmind.ai/tests/prompt_validation/Delimitation.html)** — When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **[NegativeInstruction](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — Whether the prompt contains negative instructions.\n", + "- **[Specificity](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — How specific the prompt defines the task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f52dceb1", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70d52333", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aa89976", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8630197e", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bba99915", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "af4d6d77", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initialize the ValidMind datasets\n", + "\n", + "After validation our system prompt, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use in the next section to evaluate our agent's performance across different banking scenarios:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c70ca2c", + "metadata": {}, + "outputs": [], + "source": [ + "from banking_test_dataset import banking_test_dataset" + ] + }, + { + "cell_type": "markdown", + "id": "0268ce6e", + "metadata": {}, + "source": [ + "The next step is to connect your data with a ValidMind `Dataset` object. **This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n", + "\n", + "Initialize a ValidMind dataset object using the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset) from the ValidMind (`vm`) module. 
For this example, we'll pass in the following arguments:\n", + "\n", + "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n", + "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n", + "- **`text_column`** — The name of the column containing the text input data.\n", + "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7e9d158", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset = vm.init_dataset(\n", + " input_id=\"banking_test_dataset\",\n", + " dataset=banking_test_dataset,\n", + " text_column=\"input\",\n", + " target_column=\"possible_outputs\",\n", + ")\n", + "\n", + "print(\"Banking Test Dataset Initialized in ValidMind!\")\n", + "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n", + "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n", + "vm_test_dataset._df" + ] + }, + { + "cell_type": "markdown", + "id": "b9143fb6", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign predictions\n", + "\n", + "Now that both the model object and the datasets have been registered, we'll assign predictions to capture the banking agent's responses for evaluation:\n", + "\n", + "- The [`assign_predictions()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#assign_predictions) from the `Dataset` object can link existing predictions to any number of models.\n", + "- This method links the model's class prediction values and probabilities to our `vm_train_ds` and `vm_test_ds` datasets.\n", + "\n", + "If no prediction values are passed, the method will compute predictions automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d462663", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_predictions(vm_banking_model)\n", + "\n", + "print(\"Banking Agent Predictions Generated Successfully!\")\n", + "print(f\"Predictions assigned to {len(vm_test_dataset._df)} test cases\")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8e50467e", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Running accuracy tests\n", + "\n", + "Using [`@vm.test`](https://docs.validmind.ai/validmind/validmind.html#test), let's implement some reusable custom *inline tests* to assess the accuracy of our banking agent:\n", + "\n", + "- An inline test refers to a test written and executed within the same environment as the code being tested — in this case, right in this Jupyter Notebook — without requiring a separate test file or framework.\n", + "- You'll note that the custom test functions are just regular Python functions that can include and require any Python library as you see fit." + ] + }, + { + "cell_type": "markdown", + "id": "6d8a9b90", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Response accuracy test\n", + "\n", + "We'll create a custom test that evaluates the banking agent's ability to provide accurate responses by:\n", + "\n", + "- Testing against a dataset of predefined banking questions and expected answers.\n", + "- Checking if responses contain expected keywords and banking terminology.\n", + "- Providing detailed test results including pass/fail status.\n", + "- Helping identify any gaps in the agent's banking knowledge or response quality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90232066", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "@vm.test(\"my_custom_tests.banking_accuracy_test\")\n", + "def banking_accuracy_test(model, dataset, list_of_columns):\n", + " \"\"\"\n", + " The Banking Accuracy Test evaluates whether the agent’s responses include \n", + " critical domain-specific keywords and phrases that indicate accurate, compliant,\n", + " and contextually appropriate banking information. This test ensures that the agent\n", + " provides responses containing the expected banking terminology, risk classifications,\n", + " account details, or other domain-relevant information required for regulatory compliance,\n", + " customer safety, and operational accuracy.\n", + " \"\"\"\n", + " df = dataset._df\n", + " \n", + " # Pre-compute responses for all tests\n", + " y_true = dataset.y.tolist()\n", + " y_pred = dataset.y_pred(model).tolist()\n", + "\n", + " # Vectorized test results\n", + " test_results = []\n", + " for response, keywords in zip(y_pred, y_true):\n", + " # Convert keywords to list if not already a list\n", + " if not isinstance(keywords, list):\n", + " keywords = [keywords]\n", + " test_results.append(any(str(keyword).lower() in str(response).lower() for keyword in keywords))\n", + " \n", + " results = pd.DataFrame()\n", + " column_names = [col + \"_details\" for col in list_of_columns]\n", + " results[column_names] = df[list_of_columns]\n", + " results[\"actual\"] = y_pred\n", + " results[\"expected\"] = y_true\n", + " results[\"passed\"] = test_results\n", + " results[\"error\"] = None if test_results else f'Response did not contain any expected keywords: {y_true}'\n", + " \n", + " return results" + ] + }, + { + "cell_type": "markdown", + "id": "7eed5265", + "metadata": {}, + "source": [ + "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the [`log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#log):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68884d5", + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.tests.run_test(\n", + " \"my_custom_tests.banking_accuracy_test\",\n", + " inputs={\n", + " \"dataset\": vm_test_dataset,\n", + " \"model\": vm_banking_model\n", + " },\n", + " params={\n", + " \"list_of_columns\": [\"input\"]\n", + " }\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "id": "4d758ddf", + "metadata": {}, + "source": [ + "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. Each column in the output serves a specific purpose in evaluating agent performance:\n", + "\n", + "| Column header | Description | Importance |\n", + "|--------------|-------------|------------|\n", + "| **`input`** | Original user query or request | Essential for understanding the context of each test case and tracing which inputs led to specific agent behaviors. |\n", + "| **`expected_tools`** | Banking tools that should be invoked for this request | Enables validation of correct tool selection, which is critical for agentic AI systems where choosing the right tool is a key success metric. 
|\n", + "| **`expected_output`** | Expected output or keywords that should appear in the response | Defines the success criteria for each test case, enabling objective evaluation of whether the agent produced the correct result. |\n", + "| **`session_id`** | Unique identifier for each test session | Allows tracking and correlation of related test runs, debugging specific sessions, and maintaining audit trails. |\n", + "| **`category`** | Classification of the request type | Helps organize test results by domain and identify performance patterns across different banking use cases. |\n", + "| **`banking_agent_model_output`** | Complete agent response including all messages and reasoning | Allows you to examine the full output to assess response quality, completeness, and correctness beyond just keyword matching. |\n", + "| **`banking_agent_model_tool_messages`** | Messages exchanged with the banking tools | Critical for understanding how the agent interacted with tools, what parameters were passed, and what tool outputs were received. |\n", + "| **`banking_agent_model_tool_called`** | Specific tool that was invoked | Enables validation that the agent selected the correct tool for each request, which is fundamental to agentic AI validation. |\n", + "| **`possible_outputs`** | Alternative valid outputs or keywords that could appear in the response | Provides flexibility in evaluation by accounting for multiple acceptable response formats or variations. |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78f7edb1", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.df.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "6f233bef", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Tool selection accuracy test\n", + "\n", + "We'll also create a custom test that evaluates the banking agent's ability to select the correct tools for different requests by:\n", + "\n", + "- Testing against a dataset of predefined banking queries with expected tool selections.\n", + "- Comparing the tools actually invoked by the agent against the expected tools for each request.\n", + "- Providing quantitative accuracy scores that measure the proportion of expected tools correctly selected.\n", + "- Helping identify gaps in the agent's understanding of user needs and tool selection logic." + ] + }, + { + "cell_type": "markdown", + "id": "d0b46111", + "metadata": {}, + "source": [ + "First, we'll define a helper function that extracts tool calls from the agent's messages and compares them against the expected tools. 
This function handles different message formats (dictionary or object) and calculates accuracy scores:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68798be", + "metadata": {}, + "outputs": [], + "source": [ + "def validate_tool_calls_simple(messages, expected_tools):\n", + " \"\"\"Simple validation of tool calls without RAGAS dependency issues.\"\"\"\n", + " \n", + " tool_calls_found = []\n", + " \n", + " for message in messages:\n", + " if hasattr(message, 'tool_calls') and message.tool_calls:\n", + " for tool_call in message.tool_calls:\n", + " # Handle both dictionary and object formats\n", + " if isinstance(tool_call, dict):\n", + " tool_calls_found.append(tool_call['name'])\n", + " else:\n", + " # ToolCall object - use attribute access\n", + " tool_calls_found.append(tool_call.name)\n", + " \n", + " # Check if expected tools were called\n", + " accuracy = 0.0\n", + " matches = 0\n", + " if expected_tools:\n", + " matches = sum(1 for tool in expected_tools if tool in tool_calls_found)\n", + " accuracy = matches / len(expected_tools)\n", + " \n", + " return {\n", + " 'expected_tools': expected_tools,\n", + " 'found_tools': tool_calls_found,\n", + " 'matches': matches,\n", + " 'total_expected': len(expected_tools) if expected_tools else 0,\n", + " 'accuracy': accuracy,\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "1b45472c", + "metadata": {}, + "source": [ + "Now we'll define the main test function that uses the helper function to evaluate tool selection accuracy across all test cases in the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "604d7313", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.BankingToolCallAccuracy\")\n", + "def BankingToolCallAccuracy(dataset, agent_output_column, expected_tools_column):\n", + " \"\"\"\n", + " Evaluates the tool selection accuracy of a LangGraph-powered banking agent.\n", + "\n", + " This test measures whether the agent correctly identifies and invokes the required banking tools\n", + " for each user query scenario.\n", + " For each case, the outputs generated by the agent (including its tool calls) are compared against an\n", + " expected set of tools. The test considers both coverage and exactness: it computes the proportion of\n", + " expected tools correctly called by the agent for each instance.\n", + "\n", + " Parameters:\n", + " dataset (VMDataset): The dataset containing user queries, agent outputs, and ground-truth tool expectations.\n", + " agent_output_column (str): Dataset column name containing agent outputs (should include tool call details in 'messages').\n", + " expected_tools_column (str): Dataset column specifying the true expected tools (as lists).\n", + "\n", + " Returns:\n", + " List[dict]: Per-row dictionaries with details: expected tools, found tools, match count, total expected, and accuracy score.\n", + "\n", + " Purpose:\n", + " Provides diagnostic evidence of the banking agent's core reasoning ability—specifically, its capacity to\n", + " interpret user needs and select the correct banking actions. Useful for diagnosing gaps in tool coverage,\n", + " misclassifications, or breakdowns in agent logic.\n", + "\n", + " Interpretation:\n", + " - An accuracy of 1.0 signals perfect tool selection for that example.\n", + " - Lower scores may indicate partial or complete failures to invoke required tools.\n", + " - Review 'found_tools' vs. 
'expected_tools' to understand the source of discrepancies.\n", + "\n", + " Strengths:\n", + " - Directly tests a core capability of compositional tool-use agents.\n", + " - Framework-agnostic; robust to tool call output format (object or dict).\n", + " - Supports batch validation and result logging for systematic documentation.\n", + "\n", + " Limitations:\n", + " - Does not penalize extra, unnecessary tool calls.\n", + " - Does not assess result quality—only correct invocation.\n", + "\n", + " \"\"\"\n", + " df = dataset._df\n", + " \n", + " results = []\n", + " for i, row in df.iterrows():\n", + " result = validate_tool_calls_simple(row[agent_output_column]['messages'], row[expected_tools_column])\n", + " results.append(result)\n", + " \n", + " return results" + ] + }, + { + "cell_type": "markdown", + "id": "d594c973", + "metadata": {}, + "source": [ + "Finally, we can call our function with `run_test()` and log the test results to the ValidMind Platform:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd14115e", + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.tests.run_test(\n", + " \"my_custom_tests.BankingToolCallAccuracy\",\n", + " inputs={\n", + " \"dataset\": vm_test_dataset,\n", + " },\n", + " params={\n", + " \"agent_output_column\": \"banking_agent_model_output\",\n", + " \"expected_tools_column\": \"expected_tools\"\n", + " }\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "id": "f78f4107", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Assigning AI evaluation metric scores\n", + "\n", + "*AI agent evaluation metrics* are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the *full execution trace* — including reasoning steps, tool calls, intermediate decisions, and outcomes, rather than just single input–output pairs. These metrics are essential because agent failures often occur in ways traditional LLM metrics miss — for example, choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently.\n", + "\n", + "In this section, we'll evaluate our banking agent's outputs and add scoring to our sample dataset against metrics defined in [DeepEval’s AI agent evaluation framework](https://deepeval.com/guides/guides-ai-agent-evaluation-metrics) which breaks down AI agent evaluation into three layers with corresponding subcategories: **reasoning**, **action**, and **execution**.\n", + "\n", + "Together, these three metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." 
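Because these metrics score the full execution trace rather than a single response, it helps to look at what that trace contains for one of our test cases before assigning scores. A minimal sketch, assuming the agent output column is named `banking_agent_model_output` as elsewhere in this notebook:

```python
# Peek at the execution trace captured for the first test case
trace = vm_test_dataset._df["banking_agent_model_output"].iloc[0]
for message in trace["messages"]:
    tool_calls = getattr(message, "tool_calls", None)
    names = [tc["name"] if isinstance(tc, dict) else tc.name for tc in tool_calls] if tool_calls else []
    print(f"{type(message).__name__:<12} tool calls: {names or 'none'}")
```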
+ ] + }, + { + "cell_type": "markdown", + "id": "3a9c853a", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Identify relevant DeepEval scorers\n", + "\n", + "*Scorers* are evaluation metrics that analyze model outputs and store their results in the dataset:\n", + "\n", + "- Each scorer adds a new column to the dataset with format: `{scorer_name}_{metric_name}`\n", + "- The column contains the numeric score (typically `0`-`1`) for each example\n", + "- Multiple scorers can be run on the same dataset, each adding their own column\n", + "- Scores are persisted in the dataset for later analysis and visualization\n", + "- Common scorer patterns include:\n", + " - Model performance metrics (accuracy, F1, etc.)\n", + " - Output quality metrics (relevance, faithfulness)\n", + " - Task-specific metrics (completion, correctness)\n", + "\n", + "Use `list_scorers()` from [`validmind.scorers`](https://docs.validmind.ai/validmind/validmind/tests.html#scorer) to discover all available scoring methods and their IDs that can be used with `assign_scores()`. We'll filter these results to return only DeepEval scorers for our desired three metrics in a formatted table with descriptions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730c70ec", + "metadata": {}, + "outputs": [], + "source": [ + "# Load all DeepEval scorers\n", + "llm_scorers_dict = vm.tests.load._load_tests([s for s in vm.scorer.list_scorers() if \"deepeval\" in s.lower()])\n", + "\n", + "# Categorize scorers by metric layer\n", + "reasoning_scorers = {}\n", + "action_scorers = {}\n", + "execution_scorers = {}\n", + "\n", + "for scorer_id, scorer_func in llm_scorers_dict.items():\n", + " tags = getattr(scorer_func, \"__tags__\", [])\n", + " scorer_name = scorer_id.split(\".\")[-1]\n", + "\n", + " if \"reasoning_layer\" in tags:\n", + " reasoning_scorers[scorer_id] = scorer_func\n", + " elif \"action_layer\" in tags:\n", + " # StepEfficiency is tagged as action_layer but belongs to execution per DeepEval framework\n", + " if \"StepEfficiency\" in scorer_name:\n", + " execution_scorers[scorer_id] = scorer_func\n", + " else:\n", + " action_scorers[scorer_id] = scorer_func\n", + " elif \"TaskCompletion\" in scorer_name:\n", + " execution_scorers[scorer_id] = scorer_func\n", + "\n", + "# Display scorers by category\n", + "print(\"=\" * 80)\n", + "print(\"REASONING LAYER\")\n", + "print(\"=\" * 80)\n", + "if reasoning_scorers:\n", + " reasoning_df = vm.tests.load._pretty_list_tests(reasoning_scorers, truncate=True)\n", + " display(reasoning_df)\n", + "else:\n", + " print(\"No reasoning layer scorers found.\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"ACTION LAYER\")\n", + "print(\"=\" * 80)\n", + "if action_scorers:\n", + " action_df = vm.tests.load._pretty_list_tests(action_scorers, truncate=True)\n", + " display(action_df)\n", + "else:\n", + " print(\"No action layer scorers found.\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"EXECUTION LAYER\")\n", + "print(\"=\" * 80)\n", + "if execution_scorers:\n", + " execution_df = vm.tests.load._pretty_list_tests(execution_scorers, truncate=True)\n", + " display(execution_df)\n", + "else:\n", + " print(\"No execution layer scorers found.\")" + ] + }, + { + "cell_type": "markdown", + "id": "4dd73d0d", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign reasoning scores\n", + "\n", + "*Reasoning* evaluates planning and strategy generation:\n", + "\n", + "- **Plan quality** – How logical, complete, and efficient the agent’s plan is.\n", + 
"- **Plan adherence** – Whether the agent follows its own plan during execution." + ] + }, + { + "cell_type": "markdown", + "id": "06ccae28", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Plan quality score\n", + "\n", + "Let's measure how well our banking agent generates a plan before acting. A high score means the plan is logical, complete, and efficient." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52f362ba", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.PlanQuality\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8dcdc88f", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Plan adherence score\n", + "\n", + "Let's check whether our banking agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4124a7c2", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.PlanAdherence\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " expected_output_column = \"expected_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6da1ac95", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign action scores\n", + "\n", + "*Action* assesses tool usage and argument generation:\n", + "\n", + "- **Tool correctness** – Whether the agent selects and calls the right tools.\n", + "- **Argument correctness** – Whether the agent generates correct tool arguments." + ] + }, + { + "cell_type": "markdown", + "id": "d4db8270", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Tool correctness score\n", + "\n", + "Let's evaluate if our banking agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d2e8a25", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.ToolCorrectness\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " expected_tools_column = \"expected_tools\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "9aa50b05", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Argument correctness score\n", + "\n", + "Let's assesses whether our banking agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04f90489", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.ArgumentCorrectness\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c59e5595", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign execution scores\n", + "\n", + "*Execution* measures end-to-end performance:\n", + "\n", + "- **Task completion** – Whether the agent successfully completes the intended task.\n", + "- **Step efficiency** – Whether the agent avoids unnecessary or redundant steps." + ] + }, + { + "cell_type": "markdown", + "id": "d64600ca", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Task completion score\n", + "\n", + "Let's evaluate whether our banking agent successfully completes the requested tasks. Incomplete task execution can lead to user dissatisfaction and failed banking operations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05024f1f", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.TaskCompletion\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "21aa9b0d", + "metadata": {}, + "source": [ + "As you recall from the beginning of this section, when we run scorers through `assign_scores()`, the return values are automatically processed and added as new columns with the format `{scorer_name}_{metric_name}`. Note that the task completion scorer has added a new column `TaskCompletion_score` to our dataset.\n", + "\n", + "We'll use this column to visualize the distribution of task completion scores across our test cases through the [BoxPlot test](https://docs.validmind.ai/validmind/validmind/tests/plots/BoxPlot.html#boxplot):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f6d08ca", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.plots.BoxPlot\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " params={\n", + " \"columns\": \"TaskCompletion_score\",\n", + " \"title\": \"Distribution of Task Completion Scores\",\n", + " \"ylabel\": \"Score\",\n", + " \"figsize\": (8, 6)\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "5a6f6042", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Step efficiency score\n", + "\n", + "Let's evaluate whether our banking agent avoids unnecessary or redundant steps during task execution. Inefficient step sequences can lead to increased latency, higher costs, and poor user experience." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa6e154a", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.StepEfficiency\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "012bbcb8", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Running RAGAS tests\n", + "\n", + "Next, let's run some out-of-the-box *Retrieval-Augmented Generation Assessment* (RAGAS) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", + "\n", + "Our banking agent uses tools to retrieve information and generates responses based on that context, making it similar to a RAG system. RAGAS metrics help evaluate the quality of this integration by analyzing the relationship between retrieved tool outputs, user queries, and generated responses.\n", + "\n", + "These tests provide insights into how well our banking agent integrates tool usage with conversational abilities, ensuring it provides accurate, relevant, and helpful responses to banking users while maintaining fidelity to retrieved information." + ] + }, + { + "cell_type": "markdown", + "id": "2036afba", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Identify relevant RAGAS tests\n", + "\n", + "Let's explore some of ValidMind's available tests. Using ValidMind’s repository of tests streamlines your development testing, and helps you ensure that your models are being documented and evaluated appropriately.\n", + "\n", + "You can pass `tasks` and `tags` as parameters to the [`vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) to filter the tests based on the tags and task types:\n", + "\n", + "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `text_qa` tasks.\n", + "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `ragas` tag.\n", + "\n", + "We'll then run three of these tests returned as examples below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0701f5a9", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(task=\"text_qa\", tags=[\"ragas\"])" + ] + }, + { + "cell_type": "markdown", + "id": "c1741ffc", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Faithfulness\n", + "\n", + "Let's evaluate whether the banking agent's responses accurately reflect the information retrieved from tools. Unfaithful responses can misreport credit analysis, financial calculations, and compliance results—undermining user trust in the banking agent." 
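+    "\n",
+    "The cell below passes each column name as a single-entry list via `param_grid`. With one value per parameter this should behave the same as passing plain `params`, the form used for Response Relevancy later in this section. A minimal sketch of that equivalent call, assuming the same columns as the cell below:\n",
+    "\n",
+    "```python\n",
+    "# Sketch: the same Faithfulness test with single-valued params instead of a one-entry param_grid\n",
+    "vm.tests.run_test(\n",
+    "    \"validmind.model_validation.ragas.Faithfulness\",\n",
+    "    inputs={\"dataset\": vm_test_dataset},\n",
+    "    params={\n",
+    "        \"user_input_column\": \"input\",\n",
+    "        \"response_column\": \"banking_agent_model_prediction\",\n",
+    "        \"retrieved_contexts_column\": \"banking_agent_model_tool_messages\",\n",
+    "    },\n",
+    ").log()\n",
+    "```"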
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92044533", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.Faithfulness\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"response_column\": [\"banking_agent_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "42b71ccc", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Response Relevancy\n", + "\n", + "Let's evaluate whether the banking agent's answers address the user's original question or request. Irrelevant or off-topic responses can frustrate users and fail to deliver the banking information they need." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7483bc3", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " params={\n", + " \"user_input_column\": \"input\",\n", + " \"response_column\": \"banking_agent_model_prediction\",\n", + " \"retrieved_contexts_column\": \"banking_agent_model_tool_messages\",\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "4f4d0569", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Context Recall\n", + "\n", + "Let's evaluate how well the banking agent uses the information retrieved from tools when generating its responses. Poor context recall can lead to incomplete or underinformed answers even when the right tools were selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5dc00ce", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.ContextRecall\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " \"reference_column\": [\"banking_agent_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "b987b00e", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Running safety tests\n", + "\n", + "Finally, let's run some out-of-the-box *safety* tests available in the ValidMind Library. Safety tests provide specialized metrics for evaluating whether AI agents operate reliably and securely. These metrics analyze different aspects of agent behavior by assessing adherence to safety guidelines, consistency of outputs, and resistance to harmful or inappropriate requests.\n", + "\n", + "Our banking agent handles sensitive financial information and user requests, making safety and reliability essential. Safety tests help evaluate whether the agent maintains appropriate boundaries, responds consistently and correctly to inputs, and avoids generating harmful, biased, or unprofessional content.\n", + "\n", + "These tests provide insights into how well our banking agent upholds standards of fairness and professionalism, ensuring it operates reliably and securely for banking users." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a754cca3", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### AspectCritic\n", + "\n", + "Let's evaluate our banking agent's responses across multiple quality dimensions — conciseness, coherence, correctness, harmfulness, and maliciousness. Weak performance on these dimensions can degrade user experience, fall short of professional banking standards, or introduce safety risks. \n", + "\n", + "We'll use the `AspectCritic` we identified earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "148daa2b", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.AspectCritic\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"response_column\": [\"banking_agent_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "92e5b1f6", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Bias\n", + "\n", + "Let's evaluate whether our banking agent's prompts contain unintended biases that could affect banking decisions. Biased prompts can lead to unfair or discriminatory outcomes — undermining customer trust and exposing the institution to compliance risk.\n", + "\n", + "We'll first use `list_tests()` again to filter for tests relating to `prompt_validation`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74eba86c", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(filter=\"prompt_validation\")" + ] + }, + { + "cell_type": "markdown", + "id": "bcc66b65", + "metadata": {}, + "source": [ + "And then run the identified `Bias` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "062cf8e7", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Bias\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "a2832750", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." + ] + }, + { + "cell_type": "markdown", + "id": "a8cb1a58", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", + "\n", + " What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)\n", + "\n", + "3. Click into any section related to the tests we ran in this notebook, for example: **4.3. Prompt Evaluation** to review the results of the tests we logged." 
+ ] + }, + { + "cell_type": "markdown", + "id": "94ef26be", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Customize the banking agent for your use case\n", + "\n", + "You've now built an agentic AI system designed for banking use cases that supports compliance with supervisory guidance such as SR 11-7 and SS1/23, covering credit and fraud risk assessment for both retail and commercial banking. Extend this example agent to real-world banking scenarios and production deployment by:\n", + "\n", + "- Adapting the banking tools to your organization's specific requirements\n", + "- Adding more banking scenarios and edge cases to your test set\n", + "- Connecting the agent to your banking systems and databases\n", + "- Implementing additional banking-specific tools and workflows" + ] + }, + { + "cell_type": "markdown", + "id": "a681e49c", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "Learn more about the ValidMind Library tools we used in this notebook:\n", + "\n", + "- [Custom prompts](https://docs.validmind.ai/notebooks/how_to/customize_test_result_descriptions.html)\n", + "- [Custom tests](https://docs.validmind.ai/notebooks/code_samples/custom_tests/implement_custom_tests.html)\n", + "- [ValidMind scorers](https://docs.validmind.ai/notebooks/how_to/assign_scores_complete_tutorial.html)\n", + "\n", + "We also offer many more interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "id": "707c1b6e", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9733adff", + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "id": "e4b0b646", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "387fa7f1", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-de4baf0f42ba4a37946d52586dff1049", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } From 8ea7f009ada7a5f69a428e97c0b59ddda46356f2 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Wed, 28 Jan 2026 15:26:37 -0800 Subject: [PATCH 52/54] Removing whitespaces from StepEfficiency.py --- .../scorers/llm/deepeval/StepEfficiency.py | 44 ++++++++++--------- validmind/tests/__types__.py | 1 + 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/validmind/scorers/llm/deepeval/StepEfficiency.py b/validmind/scorers/llm/deepeval/StepEfficiency.py index be2e2406c..4b6346770 100644 --- a/validmind/scorers/llm/deepeval/StepEfficiency.py +++ b/validmind/scorers/llm/deepeval/StepEfficiency.py @@ -101,20 +101,20 @@ def StepEfficiency( from validmind.scorers.llm.deepeval import _convert_to_tool_call_list tools_called_value = _convert_to_tool_call_list(tools_called_value) - + trace_dict = row.get(agent_output_column, {}) - + # StepEfficiencyMetric requires a properly structured trace # Ensure trace_dict has the necessary structure if not isinstance(trace_dict, dict): trace_dict = {} - + # Ensure trace_dict has 'input' and 'output' for task extraction if "input" not in trace_dict: trace_dict["input"] = input_value if "output" not in trace_dict: trace_dict["output"] = actual_output_value - + test_case = LLMTestCase( input=input_value, actual_output=actual_output_value, @@ -133,25 +133,29 @@ def StepEfficiency( # This can happen if the trace doesn't contain the required execution steps error_msg = str(e) if "prompt" in error_msg or "referenced before assignment" in error_msg: - results.append({ - "score": 0.0, - "reason": ( - f"StepEfficiency evaluation failed: The agent trace may not contain " - f"sufficient execution steps for analysis. StepEfficiencyMetric requires " - f"a complete execution trace with step-by-step actions. " - f"Original error: {error_msg}" - ) - }) + results.append( + { + "score": 0.0, + "reason": ( + f"StepEfficiency evaluation failed: The agent trace may not contain " + f"sufficient execution steps for analysis. StepEfficiencyMetric requires " + f"a complete execution trace with step-by-step actions. " + f"Original error: {error_msg}" + ), + } + ) else: raise except Exception as e: # Handle other potential errors gracefully - results.append({ - "score": 0.0, - "reason": ( - f"StepEfficiency evaluation failed: {str(e)}. " - f"This metric requires a properly structured agent execution trace." - ) - }) + results.append( + { + "score": 0.0, + "reason": ( + f"StepEfficiency evaluation failed: {str(e)}. " + f"This metric requires a properly structured agent execution trace." 
+ ), + } + ) return results diff --git a/validmind/tests/__types__.py b/validmind/tests/__types__.py index d936d38f6..17bffdcbe 100644 --- a/validmind/tests/__types__.py +++ b/validmind/tests/__types__.py @@ -219,6 +219,7 @@ "validmind.scorers.llm.deepeval.Hallucination", "validmind.scorers.llm.deepeval.PlanAdherence", "validmind.scorers.llm.deepeval.PlanQuality", + "validmind.scorers.llm.deepeval.StepEfficiency", "validmind.scorers.llm.deepeval.Summarization", "validmind.scorers.llm.deepeval.TaskCompletion", "validmind.scorers.llm.deepeval.ToolCorrectness", From a9bb8933bbf5b43c7352122ce6a08f23ce7f6f05 Mon Sep 17 00:00:00 2001 From: Beck <164545837+validbeck@users.noreply.github.com> Date: Thu, 29 Jan 2026 11:16:08 -0800 Subject: [PATCH 53/54] StepEfficiency.py fix test 2 --- .../scorers/llm/deepeval/StepEfficiency.py | 157 ++++++++++-------- 1 file changed, 86 insertions(+), 71 deletions(-) diff --git a/validmind/scorers/llm/deepeval/StepEfficiency.py b/validmind/scorers/llm/deepeval/StepEfficiency.py index 4b6346770..41a62f5aa 100644 --- a/validmind/scorers/llm/deepeval/StepEfficiency.py +++ b/validmind/scorers/llm/deepeval/StepEfficiency.py @@ -26,6 +26,86 @@ raise e +def _validate_step_efficiency_columns( + dataset: VMDataset, + input_column: str, + actual_output_column: str, +) -> None: + """Validate required columns exist; raise ValueError if any are missing.""" + missing_columns: List[str] = [] + if input_column not in dataset._df.columns: + missing_columns.append(input_column) + if actual_output_column not in dataset._df.columns: + missing_columns.append(actual_output_column) + if missing_columns: + raise ValueError( + f"Required columns {missing_columns} not found in dataset. " + f"Available columns: {dataset._df.columns.tolist()}" + ) + + +def _normalize_tools_called(tools_called_value: Any) -> List[ToolCall]: + """Return tools_called as a list of ToolCall; convert if needed.""" + if isinstance(tools_called_value, list) and all( + isinstance(tool, ToolCall) for tool in tools_called_value + ): + return tools_called_value + from validmind.scorers.llm.deepeval import _convert_to_tool_call_list + + return _convert_to_tool_call_list(tools_called_value) + + +def _prepare_trace_dict( + row: Any, + agent_output_column: str, + input_value: Any, + actual_output_value: Any, +) -> Dict[str, Any]: + """Build trace dict from row with required 'input' and 'output' keys.""" + trace_dict = row.get(agent_output_column, {}) + if not isinstance(trace_dict, dict): + trace_dict = {} + if "input" not in trace_dict: + trace_dict["input"] = input_value + if "output" not in trace_dict: + trace_dict["output"] = actual_output_value + return trace_dict + + +def _evaluate_single_step_efficiency( + test_case: LLMTestCase, + metric: StepEfficiencyMetric, +) -> Dict[str, Any]: + """Run StepEfficiencyMetric on one test case; return score and reason or fallback.""" + try: + result = evaluate(test_cases=[test_case], metrics=[metric]) + metric_data = result.test_results[0].metrics_data[0] + score = metric_data.score + reason = getattr(metric_data, "reason", "No reason provided") + return {"score": score, "reason": reason} + except (UnboundLocalError, AttributeError, KeyError) as e: + error_msg = str(e) + if "prompt" in error_msg or "referenced before assignment" in error_msg: + return { + "score": 0.0, + "reason": ( + f"StepEfficiency evaluation failed: The agent trace may not contain " + f"sufficient execution steps for analysis. 
StepEfficiencyMetric requires " + f"a complete execution trace with step-by-step actions. " + f"Original error: {error_msg}" + ), + } + raise + except Exception as e: + return { + "score": 0.0, + "reason": ( + f"StepEfficiency evaluation failed: {str(e)}. " + f"This metric requires a properly structured agent execution trace." + ), + } + + @scorer() @tags("llm", "deepeval", "agent_evaluation", "action_layer", "agentic") @tasks("llm") @@ -66,22 +146,9 @@ def StepEfficiency( Raises: ValueError: If required columns are missing """ - # Validate required columns exist in dataset - missing_columns: List[str] = [] - if input_column not in dataset._df.columns: - missing_columns.append(input_column) - - if actual_output_column not in dataset._df.columns: - missing_columns.append(actual_output_column) - - if missing_columns: - raise ValueError( - f"Required columns {missing_columns} not found in dataset. " - f"Available columns: {dataset._df.columns.tolist()}" - ) + _validate_step_efficiency_columns(dataset, input_column, actual_output_column) _, model = get_client_and_model() - metric = StepEfficiencyMetric( threshold=threshold, model=model, @@ -94,68 +161,16 @@ def StepEfficiency( for _, row in dataset._df.iterrows(): input_value = row[input_column] actual_output_value = row[actual_output_column] - tools_called_value = row.get(tools_called_column, []) - if not isinstance(tools_called_value, list) or not all( - isinstance(tool, ToolCall) for tool in tools_called_value - ): - from validmind.scorers.llm.deepeval import _convert_to_tool_call_list - - tools_called_value = _convert_to_tool_call_list(tools_called_value) - - trace_dict = row.get(agent_output_column, {}) - - # StepEfficiencyMetric requires a properly structured trace - # Ensure trace_dict has the necessary structure - if not isinstance(trace_dict, dict): - trace_dict = {} - - # Ensure trace_dict has 'input' and 'output' for task extraction - if "input" not in trace_dict: - trace_dict["input"] = input_value - if "output" not in trace_dict: - trace_dict["output"] = actual_output_value - + tools_called_value = _normalize_tools_called(row.get(tools_called_column, [])) + trace_dict = _prepare_trace_dict( + row, agent_output_column, input_value, actual_output_value + ) test_case = LLMTestCase( input=input_value, actual_output=actual_output_value, tools_called=tools_called_value, _trace_dict=trace_dict, ) - - try: - result = evaluate(test_cases=[test_case], metrics=[metric]) - metric_data = result.test_results[0].metrics_data[0] - score = metric_data.score - reason = getattr(metric_data, "reason", "No reason provided") - results.append({"score": score, "reason": reason}) - except (UnboundLocalError, AttributeError, KeyError) as e: - # StepEfficiencyMetric may fail if trace structure is incomplete - # This can happen if the trace doesn't contain the required execution steps - error_msg = str(e) - if "prompt" in error_msg or "referenced before assignment" in error_msg: - results.append( - { - "score": 0.0, - "reason": ( - f"StepEfficiency evaluation failed: The agent trace may not contain " - f"sufficient execution steps for analysis. StepEfficiencyMetric requires " - f"a complete execution trace with step-by-step actions. " - f"Original error: {error_msg}" - ), - } - ) - else: - raise - except Exception as e: - # Handle other potential errors gracefully - results.append( - { - "score": 0.0, - "reason": ( - f"StepEfficiency evaluation failed: {str(e)}. " - f"This metric requires a properly structured agent execution trace." 
- ), - } - ) + results.append(_evaluate_single_step_efficiency(test_case, metric)) return results From 94ff651464b74863c01e58008ba6eb229f82bc96 Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Mon, 2 Feb 2026 14:30:24 +0000 Subject: [PATCH 54/54] remove stepefficiency test and it's references --- .../agents/document_agentic_ai.ipynb | 48 +---- .../scorers/llm/deepeval/StepEfficiency.py | 176 ------------------ validmind/scorers/llm/deepeval/__init__.py | 2 - validmind/tests/__types__.py | 1 - 4 files changed, 6 insertions(+), 221 deletions(-) delete mode 100644 validmind/scorers/llm/deepeval/StepEfficiency.py diff --git a/notebooks/code_samples/agents/document_agentic_ai.ipynb b/notebooks/code_samples/agents/document_agentic_ai.ipynb index d5203a92f..bffd3863f 100644 --- a/notebooks/code_samples/agents/document_agentic_ai.ipynb +++ b/notebooks/code_samples/agents/document_agentic_ai.ipynb @@ -69,7 +69,6 @@ " - [Argument correctness score](#toc6_3_2__) \n", " - [Assign execution scores](#toc6_4__) \n", " - [Task completion score](#toc6_4_1__) \n", - " - [Step efficiency score](#toc6_4_2__) \n", "- [Running RAGAS tests](#toc7__) \n", " - [Identify relevant RAGAS tests](#toc7_1__) \n", " - [Faithfulness](#toc7_1_1__) \n", @@ -1555,11 +1554,7 @@ " if \"reasoning_layer\" in tags:\n", " reasoning_scorers[scorer_id] = scorer_func\n", " elif \"action_layer\" in tags:\n", - " # StepEfficiency is tagged as action_layer but belongs to execution per DeepEval framework\n", - " if \"StepEfficiency\" in scorer_name:\n", - " execution_scorers[scorer_id] = scorer_func\n", - " else:\n", - " action_scorers[scorer_id] = scorer_func\n", + " action_scorers[scorer_id] = scorer_func\n", " elif \"TaskCompletion\" in scorer_name:\n", " execution_scorers[scorer_id] = scorer_func\n", "\n", @@ -1750,12 +1745,11 @@ "source": [ "\n", "\n", - "### Assign execution scores\n", + "### Assign execution score\n", "\n", "*Execution* measures end-to-end performance:\n", "\n", - "- **Task completion** – Whether the agent successfully completes the intended task.\n", - "- **Step efficiency** – Whether the agent avoids unnecessary or redundant steps." + "- **Task completion** – Whether the agent successfully completes the intended task.\n" ] }, { @@ -1817,36 +1811,6 @@ ").log()" ] }, - { - "cell_type": "markdown", - "id": "5a6f6042", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Step efficiency score\n", - "\n", - "Let's evaluate whether our banking agent avoids unnecessary or redundant steps during task execution. Inefficient step sequences can lead to increased latency, higher costs, and poor user experience." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa6e154a", - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.StepEfficiency\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, { "cell_type": "markdown", "id": "012bbcb8", @@ -2210,9 +2174,9 @@ ], "metadata": { "kernelspec": { - "display_name": "ValidMind Library", + "display_name": "validmind-1QuffXMV-py3.11", "language": "python", - "name": "validmind" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -2224,7 +2188,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/validmind/scorers/llm/deepeval/StepEfficiency.py b/validmind/scorers/llm/deepeval/StepEfficiency.py deleted file mode 100644 index 41a62f5aa..000000000 --- a/validmind/scorers/llm/deepeval/StepEfficiency.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright © 2023-2026 ValidMind Inc. All rights reserved. -# Refer to the LICENSE file in the root of this repository for details. -# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial - -from typing import Any, Dict, List - -from validmind import tags, tasks -from validmind.ai.utils import get_client_and_model -from validmind.errors import MissingDependencyError -from validmind.tests.decorator import scorer -from validmind.vm_models.dataset import VMDataset - -try: - from deepeval import evaluate - from deepeval.metrics import StepEfficiencyMetric - from deepeval.test_case import LLMTestCase, ToolCall -except ImportError as e: - if "deepeval" in str(e): - raise MissingDependencyError( - "Missing required package `deepeval` for StepEfficiency. " - "Please run `pip install validmind[llm]` to use LLM tests", - required_dependencies=["deepeval"], - extra="llm", - ) from e - - raise e - - -def _validate_step_efficiency_columns( - dataset: VMDataset, - input_column: str, - actual_output_column: str, -) -> None: - """Validate required columns exist; raise ValueError if any are missing.""" - missing_columns: List[str] = [] - if input_column not in dataset._df.columns: - missing_columns.append(input_column) - if actual_output_column not in dataset._df.columns: - missing_columns.append(actual_output_column) - if missing_columns: - raise ValueError( - f"Required columns {missing_columns} not found in dataset. 
" - f"Available columns: {dataset._df.columns.tolist()}" - ) - - -def _normalize_tools_called(tools_called_value: Any) -> List[ToolCall]: - """Return tools_called as a list of ToolCall; convert if needed.""" - if isinstance(tools_called_value, list) and all( - isinstance(tool, ToolCall) for tool in tools_called_value - ): - return tools_called_value - from validmind.scorers.llm.deepeval import _convert_to_tool_call_list - - return _convert_to_tool_call_list(tools_called_value) - - -def _prepare_trace_dict( - row: Any, - agent_output_column: str, - input_value: Any, - actual_output_value: Any, -) -> Dict[str, Any]: - """Build trace dict from row with required 'input' and 'output' keys.""" - trace_dict = row.get(agent_output_column, {}) - if not isinstance(trace_dict, dict): - trace_dict = {} - if "input" not in trace_dict: - trace_dict["input"] = input_value - if "output" not in trace_dict: - trace_dict["output"] = actual_output_value - return trace_dict - - -def _evaluate_single_step_efficiency( - test_case: LLMTestCase, - metric: StepEfficiencyMetric, -) -> Dict[str, Any]: - """Run StepEfficiencyMetric on one test case; return score and reason or fallback.""" - try: - result = evaluate(test_cases=[test_case], metrics=[metric]) - metric_data = result.test_results[0].metrics_data[0] - score = metric_data.score - reason = getattr(metric_data, "reason", "No reason provided") - return {"score": score, "reason": reason} - except (UnboundLocalError, AttributeError, KeyError) as e: - error_msg = str(e) - if "prompt" in error_msg or "referenced before assignment" in error_msg: - return { - "score": 0.0, - "reason": ( - f"StepEfficiency evaluation failed: The agent trace may not contain " - f"sufficient execution steps for analysis. StepEfficiencyMetric requires " - f"a complete execution trace with step-by-step actions. " - f"Original error: {error_msg}" - ), - } - raise - except Exception as e: - return { - "score": 0.0, - "reason": ( - f"StepEfficiency evaluation failed: {str(e)}. " - f"This metric requires a properly structured agent execution trace." - ), - } - - -@scorer() -@tags("llm", "deepeval", "agent_evaluation", "action_layer", "agentic") -@tasks("llm") -def StepEfficiency( - dataset: VMDataset, - threshold: float = 0.5, - input_column: str = "input", - actual_output_column: str = "actual_output", - agent_output_column: str = "agent_output", - tools_called_column: str = "tools_called", - strict_mode: bool = False, -) -> List[Dict[str, Any]]: - """Evaluates agent step efficiency using deepeval's StepEfficiencyMetric. - - This metric evaluates whether the agent avoids unnecessary or redundant steps - in completing the given task. It analyzes the agent's full execution trace - to assess the efficiency of the execution steps. - - Note: StepEfficiencyMetric requires a complete execution trace with step-by-step - actions. If the trace structure is incomplete or doesn't contain sufficient - execution steps, the evaluation may fail and return a score of 0.0 with an - explanatory reason. 
- - Args: - dataset: Dataset containing the agent input and execution trace - threshold: Minimum passing threshold (default: 0.5) - input_column: Column name for the task input (default: "input") - actual_output_column: Column name for the agent's final output (default: "actual_output") - agent_output_column: Column name for agent output containing trace (default: "agent_output") - tools_called_column: Column name for tools called by the agent (default: "tools_called") - strict_mode: If True, enforces a binary score (0 or 1) - - Returns: - List[Dict[str, Any]] with keys "score" and "reason" for each row. - If evaluation fails due to incomplete trace structure, returns score 0.0 - with an explanatory reason message. - - Raises: - ValueError: If required columns are missing - """ - _validate_step_efficiency_columns(dataset, input_column, actual_output_column) - - _, model = get_client_and_model() - metric = StepEfficiencyMetric( - threshold=threshold, - model=model, - include_reason=True, - strict_mode=strict_mode, - verbose_mode=False, - ) - - results: List[Dict[str, Any]] = [] - for _, row in dataset._df.iterrows(): - input_value = row[input_column] - actual_output_value = row[actual_output_column] - tools_called_value = _normalize_tools_called(row.get(tools_called_column, [])) - trace_dict = _prepare_trace_dict( - row, agent_output_column, input_value, actual_output_value - ) - test_case = LLMTestCase( - input=input_value, - actual_output=actual_output_value, - tools_called=tools_called_value, - _trace_dict=trace_dict, - ) - results.append(_evaluate_single_step_efficiency(test_case, metric)) - - return results diff --git a/validmind/scorers/llm/deepeval/__init__.py b/validmind/scorers/llm/deepeval/__init__.py index bcce1be0d..4a1de3536 100644 --- a/validmind/scorers/llm/deepeval/__init__.py +++ b/validmind/scorers/llm/deepeval/__init__.py @@ -13,7 +13,6 @@ from .ArgumentCorrectness import ArgumentCorrectness from .PlanAdherence import PlanAdherence from .PlanQuality import PlanQuality -from .StepEfficiency import StepEfficiency from .ToolCorrectness import ToolCorrectness __all__ = [ @@ -21,7 +20,6 @@ "ArgumentCorrectness", "PlanAdherence", "PlanQuality", - "StepEfficiency", "ToolCorrectness", "_extract_tool_responses", "_extract_tool_calls_from_message", diff --git a/validmind/tests/__types__.py b/validmind/tests/__types__.py index 17bffdcbe..d936d38f6 100644 --- a/validmind/tests/__types__.py +++ b/validmind/tests/__types__.py @@ -219,7 +219,6 @@ "validmind.scorers.llm.deepeval.Hallucination", "validmind.scorers.llm.deepeval.PlanAdherence", "validmind.scorers.llm.deepeval.PlanQuality", - "validmind.scorers.llm.deepeval.StepEfficiency", "validmind.scorers.llm.deepeval.Summarization", "validmind.scorers.llm.deepeval.TaskCompletion", "validmind.scorers.llm.deepeval.ToolCorrectness",