diff --git a/site/Makefile b/site/Makefile index d944db545f..150bda6156 100644 --- a/site/Makefile +++ b/site/Makefile @@ -326,8 +326,10 @@ notebooks: @rm -rf $(DEST_DIR_NB)/templates @echo "Copying _metadata.yml into notebooks/ ..." @cp developer/_metadata.yml $(DEST_DIR_NB)/_metadata.yml - @echo "Updating developer/_sidebar.yaml with code_samples directories ..." - @python scripts/update_code_samples_sidebar.py + @echo "Updating developer/_sidebar.yaml with use_cases directories ..." + @python scripts/developer-sidebar/update_use_cases.py + @echo "Updating developer/_sidebar.yaml with how-to directories ..." + @python scripts/developer-sidebar/update_how_tos.py @echo "Zip up notebooks.zip ..." @zip -r notebooks.zip $(DEST_DIR_NB) > /dev/null 2>&1 diff --git a/site/about/overview-model-documentation.qmd b/site/about/overview-model-documentation.qmd index 17c56cb014..a57598481b 100644 --- a/site/about/overview-model-documentation.qmd +++ b/site/about/overview-model-documentation.qmd @@ -150,8 +150,8 @@ How the {{< var validmind.developer >}} works: [^3]: [Customize document templates](/guide/templates/customize-document-templates.qmd) -[^4]: [Implement custom tests](/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb) +[^4]: [Implement custom tests](/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb) -[^5]: [Integrate external test providers](/notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb) +[^5]: [Integrate external test providers](/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb) diff --git a/site/about/use-cases/e-23.qmd b/site/about/use-cases/e-23.qmd index 93e06c3c63..a288345a37 100644 --- a/site/about/use-cases/e-23.qmd +++ b/site/about/use-cases/e-23.qmd @@ -139,7 +139,7 @@ Use this checklist to track your E-23 implementation progress: [^3]: [Working with document templates](/guide/templates/working-with-document-templates.qmd) -[^4]: [Run tests and test suites](/developer/model-testing/testing-overview.qmd) +[^4]: [Run tests and test suites](/developer/how-to/testing-overview.qmd) [^5]: [Working with model documentation](/guide/model-documentation/working-with-model-documentation.qmd) @@ -149,7 +149,7 @@ Use this checklist to track your E-23 implementation progress: [^8]: [Ongoing monitoring](/guide/monitoring/ongoing-monitoring.qmd) -[^9]: [Run tests and test suites](/developer/model-testing/testing-overview.qmd) +[^9]: [Run tests and test suites](/developer/how-to/testing-overview.qmd) [^10]: [Working with model documentation](/guide/model-documentation/working-with-model-documentation.qmd) diff --git a/site/about/use-cases/eu-ai-act.qmd b/site/about/use-cases/eu-ai-act.qmd index af5607cda2..2f6504ffa5 100644 --- a/site/about/use-cases/eu-ai-act.qmd +++ b/site/about/use-cases/eu-ai-act.qmd @@ -233,11 +233,11 @@ Integrate all components into a complete compliance workflow addressing Articles [^4]: [Install and initialize the {{< var validmind.developer >}}](/developer/model-documentation/install-and-initialize-validmind-library.qmd) [^5]: -[Run tests and test suites](/developer/model-testing/testing-overview.qmd) +[Run tests and test suites](/developer/how-to/testing-overview.qmd) [^6]: [Work with document templates](/guide/templates/working-with-document-templates.qmd) -[^7]: [Test descriptions](/developer/model-testing/test-descriptions.qmd) +[^7]: [Test descriptions](/developer/test-descriptions.qmd) [^8]: [Work with content blocks](/guide/model-documentation/work-with-content-blocks.qmd) diff 
--git a/site/about/use-cases/sr-11-7.qmd b/site/about/use-cases/sr-11-7.qmd index c6f971561b..a5462a32c0 100644 --- a/site/about/use-cases/sr-11-7.qmd +++ b/site/about/use-cases/sr-11-7.qmd @@ -211,7 +211,7 @@ Establish governance framework aligned to SR 11-7. [^5]: [Working with model documentation](/guide/model-documentation/working-with-model-documentation.qmd) -[^6]: [Run tests and test suites](/developer/model-testing/testing-overview.qmd) +[^6]: [Run tests and test suites](/developer/how-to/testing-overview.qmd) [^7]: [Preparing validation reports](/guide/model-validation/preparing-validation-reports.qmd) diff --git a/site/developer/_sidebar.yaml b/site/developer/_sidebar.yaml index 3b1680ea69..1215f1cf41 100644 --- a/site/developer/_sidebar.yaml +++ b/site/developer/_sidebar.yaml @@ -19,65 +19,92 @@ website: file: developer/model-documentation/install-and-initialize-validmind-library.qmd - developer/model-documentation/store-credentials-in-env-file.qmd - text: "---" - - text: "Model Development" + - text: "End-to-End Tutorials" # USING THE VARIABLE IN THE LINK TEXT MESSES UP THE MOBILE VIEW & BREADCRUMB - - text: "1 — Set up ValidMind Library" - file: notebooks/tutorials/model_development/1-set_up_validmind.ipynb - - text: "2 — Start model development process" - file: notebooks/tutorials/model_development/2-start_development_process.ipynb - - text: "3 — Integrate custom tests" - file: notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb - - text: "4 — Finalize testing & documentation" - file: notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb - - text: "---" - - text: "Model Validation" - # USING THE VARIABLE IN THE LINK TEXT MESSES UP THE MOBILE VIEW & BREADCRUMB - - text: "1 — Set up ValidMind Library for validation" - file: notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb - - text: "2 — Start model validation process" - file: notebooks/tutorials/model_validation/2-start_validation_process.ipynb - - text: "3 — Developing a challenger model" - file: notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb - - text: "4 — Finalize validation & reporting" - file: notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb + - section: "Model development" + contents: + - text: "1 — Set up ValidMind Library" + file: notebooks/tutorials/model_development/1-set_up_validmind.ipynb + - text: "2 — Start model development process" + file: notebooks/tutorials/model_development/2-start_development_process.ipynb + - text: "3 — Integrate custom tests" + file: notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb + - text: "4 — Finalize testing & documentation" + file: notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb + - section: "Model validation" + contents: + - text: "1 — Set up ValidMind Library for validation" + file: notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb + - text: "2 —\u00A0Start model validation process" + file: notebooks/tutorials/model_validation/2-start_validation_process.ipynb + - text: "3 — Developing a challenger model" + file: notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb + - text: "4 — Finalize validation & reporting" + file: notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb - text: "---" - - text: "Model Testing" + - text: "How-To" - text: "Run tests & test suites" - file: developer/model-testing/testing-overview.qmd - contents: "notebooks/how_to/**" 
- - text: "Test descriptions" - file: developer/model-testing/test-descriptions.qmd - contents: tests/** - - developer/model-testing/test-sandbox.qmd + file: developer/how-to/testing-overview.qmd + contents: + - section: "Explore tests" + contents: + - notebooks/how_to/tests/explore_tests/explore_tests.ipynb + - notebooks/how_to/tests/explore_tests/explore_test_suites.ipynb + - developer/how-to/test-sandbox.qmd + - section: "Run tests" + contents: + - notebooks/how_to/tests/run_tests/1_run_dataset_based_tests.ipynb + - notebooks/how_to/tests/run_tests/2_run_comparison_tests.ipynb + - section: "Configuring tests" + contents: "notebooks/how_to/tests/run_tests/configure_tests/*.ipynb" + - section: "Using tests in documentation" + contents: "notebooks/how_to/tests/run_tests/documentation_tests/*.ipynb" + - section: "Custom tests" + contents: + - notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb + - notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb + - text: "Use library features" + file: developer/how-to/feature-overview.qmd + contents: + - section: "Data and datasets" + contents: + - notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb + - section: "Dataset inputs" + contents: "notebooks/how_to/data_and_datasets/dataset_inputs/**/*.ipynb" + - section: "Metrics" + contents: "notebooks/how_to/metrics/**/*.ipynb" + - section: "Scoring" + contents: "notebooks/how_to/scoring/**/*.ipynb" - text: "---" - text: "Notebooks" - text: "Code samples" file: developer/samples-jupyter-notebooks.qmd contents: - section: "Agents" - contents: "notebooks/code_samples/agents/**" + contents: "notebooks/use_cases/agents/**/*.ipynb" - section: "Capital markets" - contents: "notebooks/code_samples/capital_markets/**" + contents: "notebooks/use_cases/capital_markets/**/*.ipynb" - section: "Code explainer" - contents: "notebooks/code_samples/code_explainer/**" + contents: "notebooks/use_cases/code_explainer/**/*.ipynb" - section: "Credit risk" - contents: "notebooks/code_samples/credit_risk/**" - - section: "Custom tests" - contents: "notebooks/code_samples/custom_tests/**" + contents: "notebooks/use_cases/credit_risk/**/*.ipynb" - section: "Model validation" - contents: "notebooks/code_samples/model_validation/**" + contents: "notebooks/use_cases/model_validation/**/*.ipynb" - section: "NLP and LLM" - contents: "notebooks/code_samples/nlp_and_llm/**" + contents: "notebooks/use_cases/nlp_and_llm/**/*.ipynb" - section: "Ongoing monitoring" - contents: "notebooks/code_samples/ongoing_monitoring/**" + contents: "notebooks/use_cases/ongoing_monitoring/**/*.ipynb" - section: "Regression" - contents: "notebooks/code_samples/regression/**" + contents: "notebooks/use_cases/regression/**/*.ipynb" - section: "Time series" - contents: "notebooks/code_samples/time_series/**" + contents: "notebooks/use_cases/time_series/**/*.ipynb" - text: "---" - text: "Reference" + - text: "Test descriptions" + file: developer/test-descriptions.qmd + contents: tests/** - text: "{{< var validmind.api >}}" file: validmind/validmind.qmd - - reference/validmind-rest-api-vm.qmd - - + # USING THE VARIABLE IN THE LINK TEXT MESSES UP THE MOBILE VIEW & BREADCRUMB + - text: "ValidMind Public REST API" + file: reference/validmind-rest-api-vm.qmd diff --git a/site/developer/how-to/feature-overview.qmd b/site/developer/how-to/feature-overview.qmd new file mode 100644 index 0000000000..7f9536acfd --- /dev/null +++ b/site/developer/how-to/feature-overview.qmd @@ -0,0 +1,69 @@ +--- +# Copyright © 2023-2026 
ValidMind Inc. All rights reserved. +# Refer to the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial +# USING THE VARIABLE MESSES UP THE SPACING ON THE PREVIEW TILE +title: "How to use {{< var validmind.developer >}} features" +date: last-modified +listing: + - id: tests + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../../notebooks/how_to/tests/**/*.ipynb" + - id: data-and-datasets + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../../notebooks/how_to/data_and_datasets/**/*.ipynb" + - id: metrics + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../../notebooks/how_to/metrics/**/*.ipynb" + - id: scoring + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../../notebooks/how_to/scoring/**/*.ipynb" +--- + +Browse our range of Jupyter Notebooks demonstrating how to use the core features of the {{< var validmind.developer >}}. Use these how-to notebooks to get familiar with the {{< var vm.developer >}}'s capabilities and apply them to your own use cases. + +## How-to by topic + +:::{.panel-tabset} + +## Testing + +:::{#tests} +::: + +## Data and datasets + +:::{#data-and-datasets} +::: + +## Metrics + +:::{#metrics} +::: + +## Scoring + +:::{#scoring} +::: + +::: diff --git a/site/developer/model-testing/test-sandbox.qmd b/site/developer/how-to/test-sandbox.qmd similarity index 94% rename from site/developer/model-testing/test-sandbox.qmd rename to site/developer/how-to/test-sandbox.qmd index 4e842d5db3..c9ea3a23c3 100644 --- a/site/developer/model-testing/test-sandbox.qmd +++ b/site/developer/how-to/test-sandbox.qmd @@ -6,6 +6,7 @@ title: "Test sandbox [beta]{.smallcaps}" date: last-modified aliases: - /guide/test-sandbox.html + - /developer/model-testing/test-sandbox.html --- - -{{< include /about/glossary/key_concepts/_key-concepts.qmd >}} - -::: - -## Getting started - -Start by running a pre-made test, then modify it, and finally create your own test: - -:::{#tests-beginner} -::: - -## Explore tests and test suites - -Next, find available tests and test suites using the {{< var vm.developer >}} or the interactive test sandbox: - -:::{#tests-explore} -::: - -::: {.grid} -::: {.g-col-8} -{{< video https://www.youtube.com/embed/5J7wysDIXgI?si=KPkFhx3O6NknijRf title='How do I add tests?' >}} -::: -::: - -## Intermediate - -Building on previous sections, add your own test provider, set up datasets, run tests on individual sections in your model documentation, and more: - -:::{#tests-intermediate} -::: - -## Advanced - -Need more? Try some of the advanced features provided by the {{< var vm.developer >}}: - -:::{#tests-advanced} -::: - -## When do I use tests and test suites? 
- -While you have the flexibility to decide when to use which {{< var vm.product >}} tests, we have identified a few typical scenarios with their own characteristics and needs: - -:::: {.flex .flex-wrap .justify-around} - -::: {.w-30-ns} - -#### Dataset testing - -To document and validate your dataset: - -- For generic tabular datasets: use the [`tabular_dataset`](/validmind/validmind/test_suites/tabular_datasets.qmd){target="_blank"} test suite. -- For time-series datasets: use the [`time_series_dataset`](/validmind/validmind/test_suites/time_series.qmd#timeseriesdataset){target="_blank"} test suite. - -::: - -::: {.w-30-ns} - -#### Model testing - -To document and validate your model: - -- For binary classification models: use the [`classifier`](/validmind/validmind/test_suites/classifier.qmd){target="_blank"} test suite. -- For time series models: use the [`timeseries`](/validmind/validmind/test_suites/time_series.qmd){target="_blank"} test suite. - -::: - -::: {.w-30-ns} - -#### End-to-end testing - -To document a binary classification model and the relevant dataset end-to-end: - -Use the [`classifier_full_suite`](/validmind/validmind/test_suites/classifier.qmd#classifierfullsuite){target="_blank"} test suite. - -::: - -:::: - -## Can I use my own tests? - -Absolutely! {{< var vm.product >}} supports custom tests that you develop yourself or that are provided by third-party test libraries, also referred to as _test providers_. We provide instructions with code examples that you can adapt: - -:::{#tests-custom} -::: - -## {{< var validmind.api >}} reference - -[{{< var validmind.developer >}} API Reference](/validmind/validmind.qmd){target="_blank" .button .button-green} diff --git a/site/developer/samples-jupyter-notebooks.qmd b/site/developer/samples-jupyter-notebooks.qmd index 7040dcc9ac..11a5aa88c7 100644 --- a/site/developer/samples-jupyter-notebooks.qmd +++ b/site/developer/samples-jupyter-notebooks.qmd @@ -5,20 +5,84 @@ title: "Code samples" date: last-modified listing: - type: grid - image-placeholder: "jupyter-logo-rectangle.svg" - max-description-length: 250 - image-height: "100%" - fields: [title, description, reading-time] - contents: - - "../notebooks/code_samples/*/*.ipynb" - - "../notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb" + - id: agents + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../notebooks/use_cases/agents/*.ipynb" + - id: capital-markets + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../notebooks/use_cases/capital_markets/*.ipynb" + - id: code-explainer + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../notebooks/use_cases/code_explainer/*.ipynb" + - id: credit-risk + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../notebooks/use_cases/credit_risk/*.ipynb" + - id: model-validation + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + 
contents: "../notebooks/use_cases/model_validation/*.ipynb" + - id: nlp-and-llm + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../notebooks/use_cases/nlp_and_llm/*.ipynb" + - id: ongoing-monitoring + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../notebooks/use_cases/ongoing_monitoring/*.ipynb" + - id: regression + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../notebooks/use_cases/regression/*.ipynb" + - id: time-series + type: grid + grid-columns: 2 + image-placeholder: "jupyter-logo-rectangle.svg" + max-description-length: 350 + image-height: "100%" + fields: [title, description, reading-time] + contents: "../notebooks/use_cases/time_series/*.ipynb" fig-cap-location: top aliases: - /guide/samples-jupyter-notebooks.html --- -Our Jupyter Notebook code samples showcase the capabilities and features of the {{< var validmind.developer >}}, while also providing you with useful examples that you can build on and adapt for your own use cases. +Our Jupyter Notebook code samples showcase the capabilities and features of the {{< var validmind.developer >}}, while also providing you with useful examples that you can build on and adapt for your own use cases. :::: {.flex .flex-wrap .justify-around} @@ -43,31 +107,53 @@ Our Jupyter Notebook code samples showcase the capabilities and features of the :::: -## Quickstarts +## By use case -Learn how to develop or validate a simple customer churn model with {{< var vm.product >}}. 
The easiest way to try the quickstarts are on JupyterHub: +:::{.panel-tabset} -:::: {.flex .flex-wrap .justify-around} +## Agents -::: {.w-50-ns} +:::{#agents} +::: -::: {.tc} -[{{< fa code >}} Quickstart for model documentation]({{< var url.jupyterhub >}}/hub/user-redirect/lab/tree/quickstart/quickstart_model_documentation.ipynb){.button target="_blank"} +## Capital markets +:::{#capital-markets} ::: +## Code explainer + +:::{#code-explainer} ::: -::: {.w-40-ns} +## Credit risk + +:::{#credit-risk} +::: -::: {.tc} -[{{< fa code >}} Quickstart for model validation]({{< var url.jupyterhub >}}/hub/user-redirect/lab/tree/quickstart/quickstart_model_validation.ipynb){.button target="_blank"} +## Model validation +:::{#model-validation} ::: +## NLP and LLM + +:::{#nlp-and-llm} ::: -:::: +## Ongoing monitoring +:::{#ongoing-monitoring} +::: -## Code samples for your use case +## Regression + +:::{#regression} +::: + +## Time series + +:::{#time-series} +::: + +::: diff --git a/site/developer/supported-models.qmd b/site/developer/supported-models.qmd index 76e961af43..930db0533f 100644 --- a/site/developer/supported-models.qmd +++ b/site/developer/supported-models.qmd @@ -10,13 +10,15 @@ aliases: listing: - id: next-models type: grid + grid-columns: 2 max-description-length: 250 sort: false fields: [title, description] contents: - - ../model-testing/testing-overview.qmd - - ../model-testing/test-descriptions.qmd - - ../samples-jupyter-notebooks.qmd + - /how-to/testing-overview.qmd + - test-descriptions.qmd + - /how-to/feature-overview.qmd + - samples-jupyter-notebooks.qmd --- The {{< var validmind.developer >}} provides out-of-the-box support for testing and documentation for an array of model types and modeling packages. @@ -154,5 +156,5 @@ Analyzes data points collected or sequenced over time. 
[^1]: - - [Implement custom tests](/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb) - - [Integrate external test providers](/notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb) \ No newline at end of file + - [Implement custom tests](/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb) + - [Integrate external test providers](/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb) \ No newline at end of file diff --git a/site/developer/model-testing/test-descriptions.qmd b/site/developer/test-descriptions.qmd similarity index 83% rename from site/developer/model-testing/test-descriptions.qmd rename to site/developer/test-descriptions.qmd index 4a248ee33d..fc82f7d8bc 100644 --- a/site/developer/model-testing/test-descriptions.qmd +++ b/site/developer/test-descriptions.qmd @@ -6,27 +6,28 @@ title: "Test descriptions" date: last-modified aliases: - /guide/test-descriptions.html + - /developer/model-testing/test-descriptions.html listing: - id: data-validation - contents: "../../tests/data_validation/*.md" + contents: "../tests/data_validation/*.md" type: grid max-description-length: 250 page-size: 150 fields: [title, description] - id: model-validation type: grid - contents: "../../tests/model_validation/*.md" + contents: "../tests/model_validation/*.md" max-description-length: 250 page-size: 150 fields: [title, description] - id: prompt-validation - contents: "../../tests/prompt_validation/*.md" + contents: "../tests/prompt_validation/*.md" type: grid max-description-length: 250 page-size: 150 fields: [title, description] - id: ongoing-monitoring - contents: "../../tests/ongoing_monitoring/*.md" + contents: "../tests/ongoing_monitoring/*.md" type: grid max-description-length: 250 page-size: 150 @@ -36,7 +37,7 @@ listing: Tests that are available as part of the {{< var validmind.developer >}}, grouped by type of validation or monitoring test. ::: {.callout} -## {{< fa flask >}} [Try the test sandbox [beta]{.smallcaps}](test-sandbox.qmd) +## {{< fa flask >}} [Try the test sandbox [beta]{.smallcaps}](how-to/test-sandbox.qmd) Explore our interactive sandbox to see what tests are available in the {{< var validmind.developer >}}. ::: @@ -63,4 +64,4 @@ Explore our interactive sandbox to see what tests are available in the {{< var v :::{#ongoing-monitoring} ::: -::: \ No newline at end of file +::: diff --git a/site/developer/validmind-library.qmd b/site/developer/validmind-library.qmd index 26d5a0c69b..8970eb565f 100644 --- a/site/developer/validmind-library.qmd +++ b/site/developer/validmind-library.qmd @@ -55,25 +55,25 @@ listing: - path: ../notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb title: "4 — Finalize validation and reporting" description: "Wrap up by learning how to include custom tests and verifying that all tests conducted during model development were run and reported accurately. By the end of this series, you will have a validation report complete with artifacts ready for review." 
- - id: run-tests + - id: library-features grid-columns: 2 type: grid max-description-length: 250 sort: false fields: [title, description] - contents: - - ../notebooks/how_to/run_tests/1_run_dataset_based_tests.ipynb -# - ../notebooks/archive/configure_test_parameters.ipynb - - ../notebooks/code_samples/custom_tests/implement_custom_tests.ipynb + contents: + - how-to/testing-overview.qmd + - how-to/feature-overview.qmd - id: code-samples + grid-columns: 3 type: grid - max-description-length: 250 + max-description-length: 350 sort: false fields: [title, description] - contents: - - ../notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb - - ../notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb - - ../notebooks/code_samples/time_series/quickstart_time_series_full_suite.ipynb + contents: + - ../notebooks/use_cases/agents/document_agentic_ai.ipynb + - ../notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb + - ../notebooks/use_cases/model_validation/validate_application_scorecard.ipynb - id: library-documentation type: grid grid-columns: 2 @@ -110,14 +110,18 @@ The {{< var validmind.developer >}} is designed to be model agnostic. If your mo ## Quickstart -After you [**sign up**](/guide/access/accessing-validmind.qmd) for {{< var vm.product >}} to get access, try our quickstarts for model documentation or validation: +After you [**sign up**](/guide/access/accessing-validmind.qmd) for {{< var vm.product >}} to get access, try our Jupyter Notebook quickstarts for model documentation or validation: :::{#library-quickstart} ::: +## End-to-end tutorials + +Learn how to use the {{< var validmind.developer >}} with our high-level Jupyter Notebook courses covering usage of ValidMind for specific roles or concepts. + -## {{< var vm.product >}} for model development +#### {{< var vm.product >}} for model development Learn how to use ValidMind for your end-to-end model documentation process based on common model development scenarios with our *ValidMind for model development* series of four introductory notebooks: @@ -126,7 +130,7 @@ Learn how to use ValidMind for your end-to-end model documentation process based -## {{< var vm.product >}} for model validation +#### {{< var vm.product >}} for model validation Learn how to use ValidMind for your end-to-end model validation process based on common scenarios with our *ValidMind for model validation* series of four introductory notebooks: @@ -134,66 +138,52 @@ Learn how to use ValidMind for your end-to-end model validation process based on ::: -## Learn how to run tests +## Learn how to use the {{< var validmind.developer >}} :::: {.flex .flex-wrap .justify-around} -::: {.w-70-ns} -The {{< var validmind.developer >}} provides many built-in tests and test suites which make it easy for developers to automate their model documentation. Start by running a pre-made test, then modify it, and finally create your own test: - -::: - -::: {.w-30-ns .tc} -[Run tests & test suites](model-testing/testing-overview.qmd){.button .button-green} +Learn how to use the comprehensive out-of-the-box tests and test suites, and other features in the {{< var validmind.developer >}} that make it easy for you to automate building, documenting, validating your models and more. +:::{#library-features} ::: :::: -:::{#run-tests} -::: - -## Try the code samples +## Try code samples by use case :::: {.flex .flex-wrap .justify-around} ::: {.w-70-ns} -Our code samples showcase the capabilities of the {{< var validmind.developer >}}. 
Examples that you can build on and adapt for your own use cases include: - +Try our Jupyter Notebook code samples that showcase the capabilities of the {{< var validmind.developer >}} and cover a variety of sample use cases. ::: -::: {.w-30-ns .tc} -[All code samples](samples-jupyter-notebooks.qmd){.button .button-green} +::: {.w-30-ns .tr} +[Code samples by use case](samples-jupyter-notebooks.qmd){.button .button-green} ::: :::: +Examples that you can build on and adapt for your own usage include: + :::{#code-samples} ::: ## Work with model documentation -:::: {.flex .flex-wrap .justify-around} - -::: {.w-60-ns .pr3} -After you have tried out the {{< var validmind.developer >}}, continue working with your model documentation in the {{< var validmind.platform >}}: - -::: - -::: {.w-40-ns .tc} -[Working with model documentation](/guide/model-documentation/working-with-model-documentation.qmd){.button .button-green} - -::: - -:::: +After you have tried out the {{< var validmind.developer >}}, continue working with your model documentation in the {{< var validmind.platform >}}:[^3] :::{#library-documentation} ::: +## {{< var validmind.api >}} reference + +[{{< var validmind.developer >}} API Reference](/validmind/validmind.qmd){target="_blank" .button .button-green} [^1]: [{{< var vm.product >}} for model development](#development) -[^2]: [{{< var vm.product >}} for model validation](#validation) \ No newline at end of file +[^2]: [{{< var vm.product >}} for model validation](#validation) + +[^3]: [Working with model documentation](/guide/model-documentation/working-with-model-documentation.qmd) \ No newline at end of file diff --git a/site/faq/_faq-images.qmd b/site/faq/_faq-images.qmd index 22b479f7b5..2b640cdd5f 100644 --- a/site/faq/_faq-images.qmd +++ b/site/faq/_faq-images.qmd @@ -4,7 +4,7 @@ SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial --> ## Do you support including images in model documentation? -Yes, as long as you can produce the image with Python or open the image from a file, you can include it in your documentation with {{< var vm.product >}}:^[[Implement custom tests](/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb#custom-test-images)] +Yes, as long as you can produce the image with Python or open the image from a file, you can include it in your documentation with {{< var vm.product >}}:^[[Implement custom tests](/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb#custom-test-images)] - If you want to log an image as a test result, you can do so by passing the path to the image as a parameter to the custom test and then opening the file in the test function. - If you are using a plotting library that isn’t directly supported by {{< var vm.product >}}, you can still return the image directly as a bytes-like object. \ No newline at end of file diff --git a/site/faq/_faq-synthetic-datasets.qmd b/site/faq/_faq-synthetic-datasets.qmd index 3851aa7635..e406354952 100644 --- a/site/faq/_faq-synthetic-datasets.qmd +++ b/site/faq/_faq-synthetic-datasets.qmd @@ -4,5 +4,5 @@ SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial --> ## Does {{< var vm.product >}} support using synthetic datasets? 
-- The {{< var validmind.developer >}} supports you bringing your own datasets, including synthetic datasets, for testing and benchmarking purposes such as for fair lending and bias testing.^[[Document a Credit Risk Model](/notebooks/code_samples/credit_risk/application_scorecard_with_bias.ipynb)] +- The {{< var validmind.developer >}} supports you bringing your own datasets, including synthetic datasets, for testing and benchmarking purposes such as for fair lending and bias testing.^[[Document a Credit Risk Model](/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb)] - If you are unable to share your real-world data with us, {{< var vm.product >}} is happy to work with you to generate custom synthetic datasets based on characteristics of your data, or provide scripts to assist with synthetic dataset generation if details cannot be shared. \ No newline at end of file diff --git a/site/faq/faq-integrations.qmd b/site/faq/faq-integrations.qmd index b7c3b0c3bf..8c3d7ac9ea 100644 --- a/site/faq/faq-integrations.qmd +++ b/site/faq/faq-integrations.qmd @@ -90,6 +90,6 @@ We will be implementing connector interfaces allowing extraction of relationship [^5]: [Do you support including images in model documentation?](#images) -[^6]: [Load dataset predictions](/notebooks/how_to/load_datasets_predictions.ipynb) +[^6]: [Load dataset predictions](/notebooks/how_to/data_and_datasets/dataset_inputs/load_datasets_predictions.ipynb) [^7]: [Do you include explainability-related testing and documentation?](#explanability) \ No newline at end of file diff --git a/site/faq/faq-testing.qmd b/site/faq/faq-testing.qmd index 6024603922..42ddf51e84 100644 --- a/site/faq/faq-testing.qmd +++ b/site/faq/faq-testing.qmd @@ -14,8 +14,8 @@ listing: sort: false fields: [title, description] contents: - - ../developer/model-testing/testing-overview.qmd - - ../developer/model-testing/test-descriptions.qmd + - ../developer/how-to/testing-overview.qmd + - ../developer/test-descriptions.qmd - ../guide/monitoring/ongoing-monitoring.qmd categories: ["testing", "model documentation", "customization", "custom data", "explainability", "ongoing monitoring", "validmind library"] --- @@ -59,7 +59,7 @@ To log tests as a developer with the {{< var validmind.developer >}}: ::: {.callout} ## Want to learn how to use {{< var vm.product >}} as a developer? -Check our our introductory series — [**{{< var vm.product >}} for model development**](/developer/validmind-library.qmd#for-model-development) +Check out our introductory series — [**{{< var vm.product >}} for model development**](/developer/validmind-library.qmd#for-model-development) ::: ## How do I log tests as a validator? @@ -74,7 +74,7 @@ To log tests as a validator with the {{< var validmind.developer >}}: ::: {.callout} ## Want to learn how to use {{< var vm.product >}} as a validator? 
-Check our our introductory series — [**{{< var vm.product >}} for model validation**](/developer/validmind-library.qmd#for-model-validation) +Check out our introductory series — [**{{< var vm.product >}} for model validation**](/developer/validmind-library.qmd#for-model-validation) ::: {{< include _faq-explainability.qmd >}} @@ -95,17 +95,17 @@ Check our our introductory series — [**{{< var vm.product >}} for model valida [^2]: [Accessing {{< var vm.product >}}](/guide/access/accessing-validmind.qmd) -[^3]: [When do I use tests and test suites?](/developer/model-testing/testing-overview.qmd#when-do-i-use-tests-and-test-suites) +[^3]: [When do I use tests and test suites?](/developer/how-to/testing-overview.qmd#when-do-i-use-tests-and-test-suites) [^4]: [`run_documentation_tests()`](/validmind/validmind.qmd#run_documentation_tests) [^5]: [`ClassImbalance()`](/validmind/validmind/tests/data_validation/ClassImbalance.qmd) -[^6]: [Can I use my own tests?](/developer/model-testing/testing-overview.qmd#can-i-use-my-own-tests) +[^6]: [Can I use my own tests?](/developer/how-to/testing-overview.qmd#can-i-use-my-own-tests) -[^7]: [Understand and utilize `RawData` in {{< var vm.product >}} tests](/notebooks/how_to/understand_utilize_rawdata.ipynb) +[^7]: [Understand and utilize `RawData` in {{< var vm.product >}} tests](/notebooks/how_to/tests/run_tests/configure_tests/understand_utilize_rawdata.ipynb) -[^8]: [Customize test result descriptions](/notebooks/how_to/customize_test_result_descriptions.ipynb) +[^8]: [Customize test result descriptions](/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb) [^9]: [Work with test results](/guide/model-documentation/work-with-test-results.qmd) diff --git a/site/guide/access/accessing-validmind.qmd b/site/guide/access/accessing-validmind.qmd index 6762377cc3..777ad4fff4 100644 --- a/site/guide/access/accessing-validmind.qmd +++ b/site/guide/access/accessing-validmind.qmd @@ -9,7 +9,7 @@ listing: - id: guides-access type: grid max-description-length: 250 - grid-columns: 2 + grid-columns: 3 sort: false fields: [title, description] contents: diff --git a/site/guide/model-documentation/work-with-content-blocks.qmd b/site/guide/model-documentation/work-with-content-blocks.qmd index 8a29bcda8c..999828aa14 100644 --- a/site/guide/model-documentation/work-with-content-blocks.qmd +++ b/site/guide/model-documentation/work-with-content-blocks.qmd @@ -168,7 +168,7 @@ Test-driven or metric over time blocks can be re-added later on but **text block [^9]: [Collaborate with others](/guide/model-documentation/collaborate-with-others.qmd) -[^10]: [Run tests and test suites](/developer/model-testing/testing-overview.qmd) +[^10]: [Run tests and test suites](/developer/how-to/testing-overview.qmd) [^11]: [View model activity](/guide/model-inventory/view-model-activity.qmd) diff --git a/site/guide/monitoring/enable-monitoring.qmd b/site/guide/monitoring/enable-monitoring.qmd index 9ff0acb8f0..6bfbfa212f 100644 --- a/site/guide/monitoring/enable-monitoring.qmd +++ b/site/guide/monitoring/enable-monitoring.qmd @@ -14,7 +14,7 @@ Enable monitoring with two steps: 2. 
[Select a monitoring template](#select-a-monitoring-template) ::: {.callout title="To try out monitoring, check out the code sample for ongoing monitoring of models."} -[Quickstart for ongoing monitoring of models with {{< var vm.product >}} {{< fa hand-point-right >}}](/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb) +[Quickstart for ongoing monitoring of models with {{< var vm.product >}} {{< fa hand-point-right >}}](/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb) ::: ::: {.attn} diff --git a/site/guide/monitoring/ongoing-monitoring.qmd b/site/guide/monitoring/ongoing-monitoring.qmd index 9242c53664..cb351f3d83 100644 --- a/site/guide/monitoring/ongoing-monitoring.qmd +++ b/site/guide/monitoring/ongoing-monitoring.qmd @@ -23,9 +23,9 @@ listing: sort: false fields: [title, description] contents: - - ../../notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb - - ../../notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb - # - ../../notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb + - ../../notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb + - ../../notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb + # - ../../notebooks/use_cases/credit_risk/application_scorecard_full_suite.ipynb - id: ongoing-monitoring-tests contents: "../../tests/ongoing_monitoring/*.md" type: grid @@ -155,4 +155,4 @@ To try out monitoring, check out the code sample for ongoing monitoring of model [^2]: [Working with model documentation](/guide/model-documentation/working-with-model-documentation.qmd) -[^3]: [Quickstart for ongoing monitoring of models with {{< var vm.product >}}](../../notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb) +[^3]: [Quickstart for ongoing monitoring of models with {{< var vm.product >}}](../../notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb) diff --git a/site/guide/monitoring/review-monitoring-results.qmd b/site/guide/monitoring/review-monitoring-results.qmd index 6a83688a72..3ba3018b45 100644 --- a/site/guide/monitoring/review-monitoring-results.qmd +++ b/site/guide/monitoring/review-monitoring-results.qmd @@ -13,7 +13,7 @@ To ensure your model continues to perform as expected, it's important to regular ::: {.callout title="To try out monitoring, check out the code sample for ongoing monitoring of models."} -[Quickstart for ongoing monitoring of models with {{< var vm.product >}} {{< fa hand-point-right >}}](/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb) +[Quickstart for ongoing monitoring of models with {{< var vm.product >}} {{< fa hand-point-right >}}](/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb) ::: ::: {.attn} diff --git a/site/guide/monitoring/work-with-metrics-over-time.qmd b/site/guide/monitoring/work-with-metrics-over-time.qmd index a94057d6ba..e474ab0643 100644 --- a/site/guide/monitoring/work-with-metrics-over-time.qmd +++ b/site/guide/monitoring/work-with-metrics-over-time.qmd @@ -16,7 +16,7 @@ Metrics over time refers to the continued monitoring of a model's performance on ::: {.column-margin} ::: {.callout} -## **[Log metrics over time {{< fa hand-point-right >}}](/notebooks/how_to/log_metrics_over_time.ipynb)** +## **[Log metrics over 
time {{< fa hand-point-right >}}](/notebooks/how_to/metrics/log_metrics_over_time.ipynb)** Learn how to log metrics over time, set thresholds, and analyze model performance trends with our Jupyter Notebook sample. ::: @@ -94,7 +94,7 @@ After you have added metrics over time to your document, you can view the follow [^1]: [Register models in the inventory](/guide/model-inventory/register-models-in-inventory.qmd) -[^2]: [Log metrics over time](/notebooks/how_to/log_metrics_over_time.ipynb) +[^2]: [Log metrics over time](/notebooks/how_to/metrics/log_metrics_over_time.ipynb) [^3]: [Manage permissions](/guide/configuration/manage-permissions.qmd) diff --git a/site/notebooks.zip b/site/notebooks.zip index 2ec23a970d..683df4e453 100644 Binary files a/site/notebooks.zip and b/site/notebooks.zip differ diff --git a/site/notebooks/EXECUTED/model_development/1-set_up_validmind.ipynb b/site/notebooks/EXECUTED/model_development/1-set_up_validmind.ipynb index f85b592d8a..61ee21e2b5 100644 --- a/site/notebooks/EXECUTED/model_development/1-set_up_validmind.ipynb +++ b/site/notebooks/EXECUTED/model_development/1-set_up_validmind.ipynb @@ -139,7 +139,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -215,8 +215,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/EXECUTED/model_development/2-start_development_process.ipynb b/site/notebooks/EXECUTED/model_development/2-start_development_process.ipynb index 1d62e9c085..23442b3d30 100644 --- a/site/notebooks/EXECUTED/model_development/2-start_development_process.ipynb +++ b/site/notebooks/EXECUTED/model_development/2-start_development_process.ipynb @@ -221,7 +221,7 @@ "source": [ "
Want to learn more about navigating ValidMind tests?\n", "

\n", - "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests" ] }, { diff --git a/site/notebooks/EXECUTED/model_development/3-integrate_custom_tests.ipynb b/site/notebooks/EXECUTED/model_development/3-integrate_custom_tests.ipynb index 49df9a5b50..9a3cbe7ef7 100644 --- a/site/notebooks/EXECUTED/model_development/3-integrate_custom_tests.ipynb +++ b/site/notebooks/EXECUTED/model_development/3-integrate_custom_tests.ipynb @@ -13,7 +13,7 @@ "- The function can be as simple or as complex as you need it to be — it can use external libraries, make API calls, or do anything else that you can do in Python.\n", "- The only requirement is that the function signature and return values can be \"understood\" and handled by the ValidMind Library. As such, custom tests offer added flexibility by extending the default tests provided by ValidMind, enabling you to document any type of model or use case.\n", "\n", - "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb) notebook.\n", + "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../how_to/tests/custom_tests/implement_custom_tests.ipynb) notebook.\n", "\n", "
Learn by doing\n", "

\n", @@ -817,7 +817,7 @@ "\n", "
Want to learn more about test providers?\n", "

\n", - "An extended introduction to test providers can be found in: Integrate external test providers
" + "An extended introduction to test providers can be found in: Integrate external test providers
" ] }, { diff --git a/site/notebooks/EXECUTED/model_development/4-finalize_testing_documentation.ipynb b/site/notebooks/EXECUTED/model_development/4-finalize_testing_documentation.ipynb index 0af95d90a7..04c745225e 100644 --- a/site/notebooks/EXECUTED/model_development/4-finalize_testing_documentation.ipynb +++ b/site/notebooks/EXECUTED/model_development/4-finalize_testing_documentation.ipynb @@ -930,9 +930,9 @@ "\n", "#### Use cases\n", "\n", - "- [Document an application scorecard model](../../code_samples/credit_risk/application_scorecard_full_suite.ipynb)\n", - "- [Linear regression documentation demo](../../code_samples/regression/quickstart_regression_full_suite.ipynb)\n", - "- [LLM model documentation demo](../../code_samples/nlp_and_llm/foundation_models_integration_demo.ipynb)" + "- [Document an application scorecard model](../../use_cases/credit_risk/application_scorecard_full_suite.ipynb)\n", + "- [Linear regression documentation demo](../../use_cases/regression/quickstart_regression_full_suite.ipynb)\n", + "- [LLM model documentation demo](../../use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb)" ] }, { @@ -943,12 +943,12 @@ "\n", "#### More how-to guides and code samples\n", "\n", - "- [Explore available tests in detail](../../how_to/explore_tests.ipynb)\n", - "- [In-depth guide on running dataset based tests](../../how_to/run_tests/1_run_dataset_based_tests.ipynb)\n", - "- [In-depth guide for implementing custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb)\n", - "- [In-depth guide to external test providers](../../code_samples/custom_tests/integrate_external_test_providers.ipynb)\n", - "- [Configuring dataset features](../../how_to/configure_dataset_features.ipynb)\n", - "- [Introduction to unit and composite metrics](../../how_to/run_unit_metrics.ipynb)" + "- [Explore available tests in detail](../../how_to/tests/explore_tests/explore_tests.ipynb)\n", + "- [In-depth guide on running dataset based tests](../../how_to/tests/run_tests/1_run_dataset_based_tests.ipynb)\n", + "- [In-depth guide for implementing custom tests](../../how_to/tests/custom_tests/implement_custom_tests.ipynb)\n", + "- [In-depth guide to external test providers](../../how_to/tests/custom_tests/integrate_external_test_providers.ipynb)\n", + "- [Configuring dataset features](../../how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb)\n", + "- [Introduction to unit and composite metrics](../../how_to/metrics/run_unit_metrics.ipynb)" ] }, { @@ -961,7 +961,7 @@ "\n", "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", "\n", - "- [Code samples](https://github.com/validmind/validmind-library/tree/main/notebooks/code_samples)\n", + "- [Use cases](https://github.com/validmind/validmind-library/tree/main/notebooks/use_cases)\n", "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)" ] }, diff --git a/site/notebooks/EXECUTED/model_validation/1-set_up_validmind_for_validation.ipynb b/site/notebooks/EXECUTED/model_validation/1-set_up_validmind_for_validation.ipynb index b9154b80c2..6a2e9e128a 100644 --- a/site/notebooks/EXECUTED/model_validation/1-set_up_validmind_for_validation.ipynb +++ b/site/notebooks/EXECUTED/model_validation/1-set_up_validmind_for_validation.ipynb @@ -141,7 +141,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - 
**dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -177,8 +177,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down — don’t worry, we’ll adjust these permissions next for validation.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/EXECUTED/model_validation/2-start_validation_process.ipynb b/site/notebooks/EXECUTED/model_validation/2-start_validation_process.ipynb index 48e4dbff66..77383ce0f2 100644 --- a/site/notebooks/EXECUTED/model_validation/2-start_validation_process.ipynb +++ b/site/notebooks/EXECUTED/model_validation/2-start_validation_process.ipynb @@ -237,7 +237,7 @@ "source": [ "
Want to learn more about navigating ValidMind tests?\n", "

\n", - "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests" ] }, { diff --git a/site/notebooks/EXECUTED/model_validation/4-finalize_validation_reporting.ipynb b/site/notebooks/EXECUTED/model_validation/4-finalize_validation_reporting.ipynb index fe4e221c90..6103fa2d41 100644 --- a/site/notebooks/EXECUTED/model_validation/4-finalize_validation_reporting.ipynb +++ b/site/notebooks/EXECUTED/model_validation/4-finalize_validation_reporting.ipynb @@ -13,7 +13,7 @@ "- The function can be as simple or as complex as you need it to be — it can use external libraries, make API calls, or do anything else that you can do in Python.\n", "- The only requirement is that the function signature and return values can be \"understood\" and handled by the ValidMind Library. As such, custom tests offer added flexibility by extending the default tests provided by ValidMind, enabling you to document any type of model or use case.\n", "\n", - "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb) notebook.\n", + "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../how_to/tests/custom_tests/implement_custom_tests.ipynb) notebook.\n", "\n", "
Learn by doing\n", "

\n", @@ -480,7 +480,7 @@ "\n", "
Want to learn more about custom tests?\n", "

\n", - "Refer to our in-depth introduction to custom tests: Implement custom tests
" + "Refer to our in-depth introduction to custom tests: Implement custom tests
" ] }, { @@ -856,7 +856,7 @@ "\n", "
Want to learn more about test providers?\n", "

\n", - "An extended introduction to test providers can be found in: Integrate external test providers
" + "An extended introduction to test providers can be found in: Integrate external test providers" ] }, { @@ -1176,10 +1176,10 @@ "\n", "#### More how-to guides and code samples\n", "\n", - "- [Explore available tests in detail](../../how_to/explore_tests.ipynb)\n", - "- [In-depth guide on running dataset based tests](../../how_to/run_tests/1_run_dataset_based_tests.ipynb)\n", - "- [In-depth guide for running comparison tests](../../how_to/run_tests/2_run_comparison_tests.ipynb)\n", - "- [In-depth guide for implementing custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb)" + "- [Explore available tests in detail](../../how_to/tests/explore_tests/explore_tests.ipynb)\n", + "- [In-depth guide on running dataset based tests](../../how_to/tests/run_tests/1_run_dataset_based_tests.ipynb)\n", + "- [In-depth guide for running comparison tests](../../how_to/tests/run_tests/2_run_comparison_tests.ipynb)\n", + "- [In-depth guide for implementing custom tests](../../how_to/tests/custom_tests/implement_custom_tests.ipynb)" ] }, { @@ -1192,7 +1192,7 @@ "\n", "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", "\n", - "- [Code samples](https://github.com/validmind/validmind-library/tree/main/notebooks/code_samples)\n", + "- [Use cases](https://github.com/validmind/validmind-library/tree/main/notebooks/use_cases)\n", "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)\n", "\n", "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." diff --git a/site/notebooks/README.md b/site/notebooks/README.md index a641f3e276..f84c2cfb98 100644 --- a/site/notebooks/README.md +++ b/site/notebooks/README.md @@ -5,9 +5,9 @@ Our [Jupyter Notebook](https://jupyter.org/) code samples are designed to showca Sample notebooks are organized into the following folders: * `notebooks/quickstart` — Quick guides to get you started with ValidMind -* `notebooks/tutorials` — Get step-by-step instructions and learn about ValidMind concepts in depth -* `notebooks/how_to` — Learn how to use specific ValidMind features, for example how to list all test suites -* `notebooks/code_samples` — Showcase end-to-end functionality for documenting or validating models +* `notebooks/tutorials` — High-level courses covering usage of ValidMind for specific roles or concepts, such as model development +* `notebooks/how_to` — Learn how to use specific ValidMind features, for example how to run tests +* `notebooks/use_cases` — Demo end-to-end use cases for ValidMind, such as documenting or validating specific kinds of models * `notebooks/code_sharing` — Share your own notebooks or document code internally ## Getting started diff --git a/site/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb b/site/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb deleted file mode 100644 index 9afebb2e6d..0000000000 --- a/site/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb +++ /dev/null @@ -1,1501 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# AI Agent Validation with ValidMind - Banking Demo\n", - "\n", - "This notebook shows how to document and evaluate an agentic AI system with the ValidMind Library. 
Using a small banking agent built in LangGraph as an example, you will run ValidMind’s built-in and custom tests and produce the artifacts needed to create evidence-backed documentation.\n", - "\n", - "An AI agent is an autonomous system that interprets inputs, selects from available tools or actions, and carries out multi-step behaviors to achieve user goals. In this example, our agent acts as a professional banking assistant that analyzes user requests and automatically selects and invokes the most appropriate specialized banking tool (credit, account, or fraud) to deliver accurate, compliant, and actionable responses.\n", - "\n", - "However, agentic capabilities bring concrete risks. The agent may misinterpret user inputs or fail to extract required parameters, producing incorrect credit assessments or inappropriate account actions; it can select the wrong tool (for example, invoking account management instead of fraud detection), which may cause unsafe, non-compliant, or customer-impacting behaviour.\n", - "\n", - "This interactive notebook guides you step-by-step through building a demo LangGraph banking agent, preparing an evaluation dataset, initializing the ValidMind Library and required objects, writing custom tests for tool-selection accuracy and entity extraction, running ValidMind’s built-in and custom test suites, and logging documentation artifacts to ValidMind." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Install the ValidMind Library](#toc2_1__) \n", - " - [Initialize the ValidMind Library](#toc2_2__) \n", - " - [Register sample model](#toc2_2_1__) \n", - " - [Apply documentation template](#toc2_2_2__) \n", - " - [Get your code snippet](#toc2_2_3__) \n", - " - [Initialize the Python environment](#toc2_3__) \n", - "- [Banking Tools](#toc3__) \n", - " - [Tool Overview](#toc3_1__) \n", - " - [Test Banking Tools Individually](#toc3_2__) \n", - "- [Complete LangGraph Banking Agent](#toc4__) \n", - "- [ValidMind Model Integration](#toc5__) \n", - "- [Prompt Validation](#toc6__) \n", - "- [Banking Test Dataset](#toc7__) \n", - " - [Initialize ValidMind Dataset](#toc7_1__) \n", - " - [Run the Agent and capture result through assign predictions](#toc7_2__) \n", - " - [Dataframe Display Settings](#toc7_2_1__) \n", - "- [Banking Accuracy Test](#toc8__) \n", - "- [Banking Tool Call Accuracy Test](#toc9__) \n", - "- [Scorers in ValidMind](#toc10__)\n", - " - [Plan Quality Metric scorer](#toc10_1) \n", - " - [Plan Adherence Metric scorer](#toc10_2) \n", - " - [Tool Correctness Metric scorer](#toc10_3) \n", - " - [Argument Correctness Metric scorer](#toc10_4) \n", - " - [Task Completion scorer](#toc10_5) \n", - "- [RAGAS Tests for an Agent Evaluation](#toc12__) \n", - " - [Faithfulness](#toc12_1__) \n", - " - [Response Relevancy](#toc12_2__) \n", - " - [Context Recall](#toc12_3__) \n", - "- [Safety](#toc13__) \n", - " - [AspectCritic](#toc13_1__) \n", - " - [Prompt bias](#toc13_2__) \n", - " - [Toxicity](#toc13_3__) \n", - "- [Demo Summary and Next Steps](#toc14__) \n", - " - [What We Built](#toc14_1__) \n", - " - [Next Steps](#toc14_2__) \n", - " - [Key Benefits](#toc14_3__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - 
"## About ValidMind\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
To access all features available in this notebook, you'll need a ValidMind account.\n", - "

\n", - "Register with ValidMind
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q \"validmind[llm]\" " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Agentic AI System`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. 
Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import validmind as vm\n", - "\n", - "vm.init(\n", - " api_host=\"...\",\n", - " api_key=\"...\",\n", - " api_secret=\"...\",\n", - " model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the Python environment\n", - "\n", - "Next, let's import all the necessary libraries for building our banking LangGraph agent system:\n", - "\n", - "- **LangChain components** for LLM integration and tool management\n", - "- **LangGraph** for building stateful, multi-step agent workflows\n", - "- **ValidMind** for model validation and testing\n", - "- **Banking tools** for specialized financial services\n", - "- **Standard libraries** for data handling and environment management\n", - "\n", - "The setup includes loading environment variables (like OpenAI API keys) needed for the LLM components to function properly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Standard library imports\n", - "from typing import TypedDict, Annotated, Sequence\n", - "\n", - "# Third party imports\n", - "import pandas as pd\n", - "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", - "from langchain_openai import ChatOpenAI\n", - "from langgraph.checkpoint.memory import MemorySaver\n", - "from langgraph.graph import StateGraph, END, START\n", - "from langgraph.graph.message import add_messages\n", - "from langgraph.prebuilt import ToolNode\n", - "\n", - "# Local imports\n", - "from banking_tools import AVAILABLE_TOOLS\n", - "from validmind.tests import run_test\n", - "\n", - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_colwidth', None)\n", - "pd.set_option('display.width', None)\n", - "pd.set_option('display.max_rows', None)\n", - "\n", - "# Load environment variables if using .env file\n", - "try:\n", - " from dotenv import load_dotenv\n", - " load_dotenv()\n", - "except ImportError:\n", - " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Banking Tools\n", - "\n", - "Now let's use the following banking demo tools that provide use cases of the financial services:\n", - "\n", - "\n", - "\n", - "### Tool Overview\n", - "1. **Credit Risk Analyzer** - Loan applications and credit decisions\n", - "2. **Customer Account Manager** - Account services and customer support\n", - "3. **Fraud Detection System** - Security and fraud prevention" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", - "print(\"\\nTool Details:\")\n", - "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", - " print(f\" - {tool.name}\") " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Test Banking Tools Individually\n", - "\n", - "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent." 
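Before running the individual tool tests below, note that `AVAILABLE_TOOLS` is imported from a local `banking_tools` module that is not included in this notebook. Purely as a hypothetical sketch (the real implementation is not shown; the parameter names match the individual tool test in the next cell, and the scoring logic is illustrative only), one of these LangChain tools might look roughly like this:

```python
from langchain_core.tools import tool

@tool
def credit_risk_analyzer(
    customer_income: float,
    customer_debt: float,
    credit_score: int,
    loan_amount: float,
    loan_type: str = "personal",
) -> str:
    """Analyze credit risk for loan applications and credit decisions."""
    # Illustrative rule-of-thumb only; the real banking_tools module is not shown here.
    dti = (customer_debt * 12) / max(customer_income, 1)
    if credit_score >= 700 and dti < 0.35:
        risk = "LOW"
    elif credit_score >= 620 and dti < 0.45:
        risk = "MEDIUM"
    else:
        risk = "HIGH"
    return (
        f"{loan_type.title()} loan request for ${loan_amount:,.0f}: "
        f"estimated risk {risk} (credit score {credit_score}, DTI {dti:.0%})."
    )

# AVAILABLE_TOOLS would then simply collect the three tools, e.g.
# AVAILABLE_TOOLS = [credit_risk_analyzer, customer_account_manager, fraud_detection_system]
```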
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Testing Individual Banking Tools\")\n", - "print(\"=\" * 60)\n", - "\n", - "# Test 1: Credit Risk Analyzer\n", - "print(\"TEST 1: Credit Risk Analyzer\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Access the underlying function using .func\n", - " credit_result = AVAILABLE_TOOLS[0].func(\n", - " customer_income=75000,\n", - " customer_debt=1200,\n", - " credit_score=720,\n", - " loan_amount=50000,\n", - " loan_type=\"personal\"\n", - " )\n", - " print(credit_result)\n", - " print(\"Credit Risk Analyzer test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)\n", - "\n", - "# Test 2: Customer Account Manager\n", - "print(\"TEST 2: Customer Account Manager\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " # Test checking balance\n", - " account_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"checking\",\n", - " customer_id=\"12345\",\n", - " action=\"check_balance\"\n", - " )\n", - " print(account_result)\n", - " \n", - " # Test getting account info\n", - " info_result = AVAILABLE_TOOLS[1].func(\n", - " account_type=\"all\",\n", - " customer_id=\"12345\", \n", - " action=\"get_info\"\n", - " )\n", - " print(info_result)\n", - " print(\"Customer Account Manager test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Customer Account Manager test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)\n", - "\n", - "# Test 3: Fraud Detection System\n", - "print(\"TEST 3: Fraud Detection System\")\n", - "print(\"-\" * 40)\n", - "try:\n", - " fraud_result = AVAILABLE_TOOLS[2].func(\n", - " transaction_id=\"TX123\",\n", - " customer_id=\"12345\",\n", - " transaction_amount=500.00,\n", - " transaction_type=\"withdrawal\",\n", - " location=\"Miami, FL\",\n", - " device_id=\"DEVICE_001\"\n", - " )\n", - " print(fraud_result)\n", - " print(\"Fraud Detection System test PASSED\")\n", - "except Exception as e:\n", - " print(f\"Fraud Detection System test FAILED: {e}\")\n", - "\n", - "print(\"\" + \"=\" * 60)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Complete LangGraph Banking Agent\n", - "\n", - "Now we'll create our intelligent banking agent with LangGraph that can automatically select and use the appropriate banking tools based on user requests." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Enhanced banking system prompt with tool selection guidance\n", - "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", - " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", - " \n", - " AVAILABLE BANKING TOOLS:\n", - " \n", - " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", - " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", - " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", - " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", - "\n", - " customer_account_manager - Manage customer accounts and provide banking services\n", - " - Use for: account information, transaction processing, product recommendations, customer service\n", - " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", - " - Parameters: account_type, customer_id, action, amount, account_details\n", - "\n", - " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", - " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", - " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", - " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", - "\n", - " BANKING INSTRUCTIONS:\n", - " - Analyze the user's banking request carefully and identify the primary need\n", - " - If they need credit analysis → use credit_risk_analyzer\n", - " - If they need financial calculations → use financial_calculator\n", - " - If they need account services → use customer_account_manager\n", - " - If they need security analysis → use fraud_detection_system\n", - " - Extract relevant parameters from the user's request\n", - " - Provide helpful, accurate banking responses based on tool outputs\n", - " - Always consider banking regulations, risk management, and best practices\n", - " - Be professional and thorough in your analysis\n", - "\n", - " Choose and use tools wisely to provide the most helpful banking assistance.\n", - " \"\"\"\n", - "# Initialize the main LLM for banking responses\n", - "main_llm = ChatOpenAI(\n", - " model=\"gpt-5-mini\",\n", - " reasoning={\n", - " \"effort\": \"low\",\n", - " \"summary\": \"auto\"\n", - " }\n", - ")\n", - "# Bind all banking tools to the main LLM\n", - "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)\n", - "\n", - "# Banking Agent State Definition\n", - "class BankingAgentState(TypedDict):\n", - " messages: Annotated[Sequence[BaseMessage], add_messages]\n", - " user_input: str\n", - " session_id: str\n", - " context: dict\n", - "\n", - "def create_banking_langgraph_agent():\n", - " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", - " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", - " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", - " messages = state[\"messages\"]\n", - " # Add system context to messages\n", - " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", - " # Get LLM response with tool 
selection\n", - " response = llm_with_tools.invoke(enhanced_messages)\n", - " return {\n", - " **state,\n", - " \"messages\": messages + [response]\n", - " }\n", - " \n", - " def should_continue(state: BankingAgentState) -> str:\n", - " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", - " last_message = state[\"messages\"][-1]\n", - " # Check if the LLM wants to use tools\n", - " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", - " return \"tools\"\n", - " return END\n", - " \n", - " # Create the banking state graph\n", - " workflow = StateGraph(BankingAgentState)\n", - " # Add nodes\n", - " workflow.add_node(\"llm\", llm_node)\n", - " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", - " # Simplified entry point - go directly to LLM\n", - " workflow.add_edge(START, \"llm\")\n", - " # From LLM, decide whether to use tools or end\n", - " workflow.add_conditional_edges(\n", - " \"llm\",\n", - " should_continue,\n", - " {\"tools\": \"tools\", END: END}\n", - " )\n", - " # Tool execution flows back to LLM for final response\n", - " workflow.add_edge(\"tools\", \"llm\")\n", - " # Set up memory\n", - " memory = MemorySaver()\n", - " # Compile the graph\n", - " agent = workflow.compile(checkpointer=memory)\n", - " return agent\n", - "\n", - "# Create the banking intelligent agent\n", - "banking_agent = create_banking_langgraph_agent()\n", - "\n", - "print(\"Banking LangGraph Agent Created Successfully!\")\n", - "print(\"\\nFeatures:\")\n", - "print(\" - Intelligent banking tool selection\")\n", - "print(\" - Comprehensive banking system prompt\")\n", - "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", - "print(\" - Automatic tool parameter extraction\")\n", - "print(\" - Professional banking assistance\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## ValidMind Model Integration\n", - "\n", - "Now we'll integrate our banking LangGraph agent with ValidMind for comprehensive testing and validation." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.models import Prompt\n", - "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list\n", - "def banking_agent_fn(input):\n", - " \"\"\"\n", - " Invoke the banking agent with the given input.\n", - " \"\"\"\n", - " try:\n", - " # Initial state for banking agent\n", - " initial_state = {\n", - " \"user_input\": input[\"input\"],\n", - " \"messages\": [HumanMessage(content=input[\"input\"])],\n", - " \"session_id\": input[\"session_id\"],\n", - " \"context\": {}\n", - " }\n", - " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", - " result = banking_agent.invoke(initial_state, config=session_config)\n", - "\n", - " from utils import capture_tool_output_messages\n", - "\n", - " # Capture all tool outputs and metadata\n", - " captured_data = capture_tool_output_messages(result)\n", - " \n", - " # Access specific tool outputs, this will be used for RAGAS tests\n", - " tool_message = \"\"\n", - " for output in captured_data[\"tool_outputs\"]:\n", - " tool_message += output['content']\n", - " \n", - " tool_calls_found = []\n", - " messages = result['messages']\n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - "\n", - "\n", - " return {\n", - " \"prediction\": result['messages'][-1].content[0]['text'],\n", - " \"output\": result,\n", - " \"tool_messages\": [tool_message],\n", - " # \"tool_calls\": tool_calls_found,\n", - " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", - " }\n", - " except Exception as e:\n", - " # Return a fallback response if the agent fails\n", - " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", - " Please try rephrasing your question or contact support if the issue persists.\"\"\"\n", - " return {\n", - " \"prediction\": error_message, \n", - " \"output\": {\n", - " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", - " \"error\": str(e)\n", - " }\n", - " }\n", - "\n", - "## Initialize the model\n", - "vm_banking_model = vm.init_model(\n", - " input_id=\"banking_agent_model\",\n", - " predict_fn=banking_agent_fn,\n", - " prompt=Prompt(template=system_context)\n", - ")\n", - "\n", - "# Add the banking agent to the vm model\n", - "vm_banking_model.model = banking_agent\n", - "\n", - "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", - "print(f\"Model ID: {vm_banking_model.input_id}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Prompt Validation\n", - "\n", - "Let's get an initial sense of how well the prompt meets a few best practices for prompt engineering. 
These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", - "\n", - "- **Clarity**: How clearly the prompt states the task.\n", - "- **Conciseness**: How succinctly the prompt states the task.\n", - "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **NegativeInstruction**: Whether the prompt contains negative instructions.\n", - "- **Specificity**: How specific the prompt defines the task." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Clarity\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Conciseness\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Delimitation\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.NegativeInstruction\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Specificity\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Banking Test Dataset\n", - "\n", - "We'll use a sample test dataset to evaluate our agent's performance across different banking scenarios.\n", - "\n", - "\n", - "\n", - "### Initialize ValidMind Dataset\n", - "\n", - "Before we can run tests and evaluations, we need to initialize our banking test dataset as a ValidMind dataset object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import our banking-specific test dataset\n", - "from banking_test_dataset import banking_test_dataset\n", - "\n", - "vm_test_dataset = vm.init_dataset(\n", - " input_id=\"banking_test_dataset\",\n", - " dataset=banking_test_dataset.sample(2),\n", - " text_column=\"input\",\n", - " target_column=\"possible_outputs\",\n", - ")\n", - "\n", - "print(\"Banking Test Dataset Initialized in ValidMind!\")\n", - "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n", - "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n", - "vm_test_dataset._df.head(1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Run the Agent and capture result through assign predictions\n", - "\n", - "Now we'll execute our banking agent on the test dataset and capture its responses for evaluation." 
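Note that `banking_test_dataset` comes from a local module that is not shown in this notebook. Based on the columns referenced throughout (the `input` text, the `possible_outputs` keyword list used as the target, plus `expected_tools` and `expected_output` used by later tests), a single row might look roughly like the following hypothetical example (values are illustrative only):

```python
import pandas as pd

# Hypothetical illustration of the dataset schema; the real banking_test_dataset is not shown here.
example_rows = pd.DataFrame(
    [
        {
            "input": "Analyze credit risk for a $50,000 personal loan for a customer "
                     "earning $75,000 with a 720 credit score.",
            "possible_outputs": ["credit", "risk", "loan"],   # keywords the accuracy test checks for
            "expected_tools": ["credit_risk_analyzer"],       # ground truth for tool-call accuracy
            "expected_output": "A credit risk assessment for the requested personal loan.",
        }
    ]
)
```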
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_predictions(vm_banking_model)\n", - "\n", - "print(\"Banking Agent Predictions Generated Successfully!\")\n", - "print(f\"Predictions assigned to {len(vm_test_dataset._df)} test cases\")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Banking Accuracy Test\n", - "\n", - "This test evaluates the banking agent's ability to provide accurate responses by:\n", - "- Testing against a dataset of predefined banking questions and expected answers\n", - "- Checking if responses contain expected keywords and banking terminology\n", - "- Providing detailed test results including pass/fail status\n", - "- Helping identify any gaps in the agent's banking knowledge or response quality" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "@vm.test(\"my_custom_tests.banking_accuracy_test\")\n", - "def banking_accuracy_test(model, dataset, list_of_columns):\n", - " \"\"\"\n", - " The Banking Accuracy Test evaluates whether the agent’s responses include \n", - " critical domain-specific keywords and phrases that indicate accurate, compliant,\n", - " and contextually appropriate banking information. This test ensures that the agent\n", - " provides responses containing the expected banking terminology, risk classifications,\n", - " account details, or other domain-relevant information required for regulatory compliance,\n", - " customer safety, and operational accuracy.\n", - " \"\"\"\n", - " df = dataset._df\n", - " \n", - " # Pre-compute responses for all tests\n", - " y_true = dataset.y.tolist()\n", - " y_pred = dataset.y_pred(model).tolist()\n", - "\n", - " # Vectorized test results\n", - " test_results = []\n", - " for response, keywords in zip(y_pred, y_true):\n", - " # Convert keywords to list if not already a list\n", - " if not isinstance(keywords, list):\n", - " keywords = [keywords]\n", - " test_results.append(any(str(keyword).lower() in str(response).lower() for keyword in keywords))\n", - " \n", - " results = pd.DataFrame()\n", - " column_names = [col + \"_details\" for col in list_of_columns]\n", - " results[column_names] = df[list_of_columns]\n", - " results[\"actual\"] = y_pred\n", - " results[\"expected\"] = y_true\n", - " results[\"passed\"] = test_results\n", - " results[\"error\"] = None if test_results else f'Response did not contain any expected keywords: {y_true}'\n", - " \n", - " return results\n", - " \n", - "result = run_test(\n", - " \"my_custom_tests.banking_accuracy_test\",\n", - " inputs={\n", - " \"dataset\": vm_test_dataset,\n", - " \"model\": vm_banking_model\n", - " },\n", - " params={\n", - " \"list_of_columns\": [\"input\"]\n", - " }\n", - ")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Banking Tool Call Accuracy Test\n", - "\n", - "This test evaluates how accurately our intelligent banking router selects the correct tools for different banking requests. This test provides quantitative feedback on the agent's core intelligence - its ability to understand what users need and select the right banking tools to help them." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.BankingToolCallAccuracy\")\n", - "def BankingToolCallAccuracy(dataset, agent_output_column, expected_tools_column):\n", - " \"\"\"\n", - " Evaluates the tool selection accuracy of a LangGraph-powered banking agent.\n", - "\n", - " This test measures whether the agent correctly identifies and invokes the required banking tools\n", - " for each user query scenario.\n", - " For each case, the outputs generated by the agent (including its tool calls) are compared against an\n", - " expected set of tools. The test considers both coverage and exactness: it computes the proportion of\n", - " expected tools correctly called by the agent for each instance.\n", - "\n", - " Parameters:\n", - " dataset (VMDataset): The dataset containing user queries, agent outputs, and ground-truth tool expectations.\n", - " agent_output_column (str): Dataset column name containing agent outputs (should include tool call details in 'messages').\n", - " expected_tools_column (str): Dataset column specifying the true expected tools (as lists).\n", - "\n", - " Returns:\n", - " List[dict]: Per-row dictionaries with details: expected tools, found tools, match count, total expected, and accuracy score.\n", - "\n", - " Purpose:\n", - " Provides diagnostic evidence of the banking agent's core reasoning ability—specifically, its capacity to\n", - " interpret user needs and select the correct banking actions. Useful for diagnosing gaps in tool coverage,\n", - " misclassifications, or breakdowns in agent logic.\n", - "\n", - " Interpretation:\n", - " - An accuracy of 1.0 signals perfect tool selection for that example.\n", - " - Lower scores may indicate partial or complete failures to invoke required tools.\n", - " - Review 'found_tools' vs. 
'expected_tools' to understand the source of discrepancies.\n", - "\n", - " Strengths:\n", - " - Directly tests a core capability of compositional tool-use agents.\n", - " - Framework-agnostic; robust to tool call output format (object or dict).\n", - " - Supports batch validation and result logging for systematic documentation.\n", - "\n", - " Limitations:\n", - " - Does not penalize extra, unnecessary tool calls.\n", - " - Does not assess result quality—only correct invocation.\n", - "\n", - " \"\"\"\n", - " def validate_tool_calls_simple(messages, expected_tools):\n", - " \"\"\"Simple validation of tool calls without RAGAS dependency issues.\"\"\"\n", - " \n", - " tool_calls_found = []\n", - " \n", - " for message in messages:\n", - " if hasattr(message, 'tool_calls') and message.tool_calls:\n", - " for tool_call in message.tool_calls:\n", - " # Handle both dictionary and object formats\n", - " if isinstance(tool_call, dict):\n", - " tool_calls_found.append(tool_call['name'])\n", - " else:\n", - " # ToolCall object - use attribute access\n", - " tool_calls_found.append(tool_call.name)\n", - " \n", - " # Check if expected tools were called\n", - " accuracy = 0.0\n", - " matches = 0\n", - " if expected_tools:\n", - " matches = sum(1 for tool in expected_tools if tool in tool_calls_found)\n", - " accuracy = matches / len(expected_tools)\n", - " \n", - " return {\n", - " 'expected_tools': expected_tools,\n", - " 'found_tools': tool_calls_found,\n", - " 'matches': matches,\n", - " 'total_expected': len(expected_tools) if expected_tools else 0,\n", - " 'accuracy': accuracy,\n", - " }\n", - "\n", - " df = dataset._df\n", - " \n", - " results = []\n", - " for i, row in df.iterrows():\n", - " result = validate_tool_calls_simple(row[agent_output_column]['messages'], row[expected_tools_column])\n", - " results.append(result)\n", - " \n", - " return results\n", - "\n", - "run_test(\n", - " \"my_custom_tests.BankingToolCallAccuracy\",\n", - " inputs = {\n", - " \"dataset\": vm_test_dataset,\n", - " },\n", - " params = {\n", - " \"agent_output_column\": \"banking_agent_model_output\",\n", - " \"expected_tools_column\": \"expected_tools\"\n", - " }\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Scorers in ValidMind\n", - "\n", - "Scorers are evaluation metrics that analyze model outputs and store their results in the dataset. 
When using `assign_scores()`:\n", - "\n", - "- Each scorer adds a new column to the dataset with format: {scorer_name}_{metric_name}\n", - "- The column contains the numeric score (typically 0-1) for each example\n", - "- Multiple scorers can be run on the same dataset, each adding their own column\n", - "- Scores are persisted in the dataset for later analysis and visualization\n", - "- Common scorer patterns include:\n", - " - Model performance metrics (accuracy, F1, etc)\n", - " - Output quality metrics (relevance, faithfulness)\n", - " - Task-specific metrics (completion, correctness)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### AI Agent Evaluation Metrics\n", - "\n", - "AI agent evaluation metrics are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the **full execution trace**—including reasoning steps, tool calls, intermediate decisions, and outcomes—rather than just single input–output pairs.\n", - "\n", - "These metrics are essential because agent failures often occur in ways traditional LLM metrics miss (e.g., choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently).\n", - "\n", - "**DeepEval’s AI agent evaluation framework** breaks evaluation into three layers with corresponding metric categories:\n", - "\n", - "1. **Reasoning Layer** – Evaluates planning and strategy generation:\n", - "\n", - " * *PlanQualityMetric* – how logical, complete, and efficient the agent’s plan is\n", - " * *PlanAdherenceMetric* – whether the agent follows its own plan during execution \n", - "\n", - "2. **Action Layer** – Assesses tool usage and argument generation:\n", - "\n", - " * *ToolCorrectnessMetric* – whether the agent selects and calls the right tools\n", - " * *ArgumentCorrectnessMetric* – whether the agent generates correct tool arguments\n", - "\n", - "3. **Execution Layer** – Measures end-to-end performance:\n", - "\n", - " * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n", - " * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n", - "\n", - "Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### **Reasoning Layer**\n", - "#### PlanQualityMetric\n", - "Let's measures how well the agent generates a plan before acting. A high score means the plan is logical, complete, and efficient." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.PlanQuality\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### PlanAdherenceMetric\n", - "Let's checks whether the agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.PlanAdherence\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " expected_output_column = \"expected_output\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### **Action Layer**\n", - "#### ToolCorrectnessMetric\n", - "Let's evaluates if the agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.ToolCorrectness\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " expected_tools_column = \"expected_tools\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### ArgumentCorrectnessMetric\n", - "Let's assesses whether the agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.ArgumentCorrectness\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### **Execution Layer**\n", - "#### TaskCompletionMetric\n", - "The TaskCompletion test evaluates whether our banking agent successfully completes the requested tasks by analyzing its outputs and tool usage. This metric assesses the agent's ability to understand user requests, execute appropriate actions, and provide complete responses that address the original query. The test provides a score between 0-1 along with detailed feedback on task completion quality." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_dataset.assign_scores(\n", - " metrics = \"validmind.scorers.llm.deepeval.TaskCompletion\",\n", - " input_column = \"input\",\n", - " actual_output_column = \"banking_agent_model_prediction\",\n", - " agent_output_column = \"banking_agent_model_output\",\n", - " tools_called_column = \"banking_agent_model_tool_called\",\n", - "\n", - ")\n", - "vm_test_dataset._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TaskCompletion scorer has added a new column 'TaskCompletion_score' to our dataset. 
This is because when we run scorers through assign_scores(), the return values are automatically processed and added as new columns with the format {scorer_name}_{metric_name}. We'll use this column to visualize the distribution of task completion scores across our test cases. Let's visualize the distribution through the box plot test." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.plots.BoxPlot\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " params={\n", - " \"columns\": \"TaskCompletion_score\",\n", - " \"title\": \"Distribution of Task Completion Scores\",\n", - " \"ylabel\": \"Score\",\n", - " \"figsize\": (8, 6)\n", - " }\n", - ").log()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## RAGAS Tests for an Agent Evaluation\n", - "\n", - "RAGAS (Retrieval-Augmented Generation Assessment) provides specialized metrics for evaluating conversational AI systems like our banking agent. These tests analyze different aspects of agent performance:\n", - "\n", - "Our banking agent uses tools to retrieve information and generates responses based on that context, making it similar to a RAG system. RAGAS metrics help evaluate:\n", - "\n", - "- **Response Quality**: How well the agent uses retrieved tool outputs to generate helpful banking responses\n", - "- **Information Faithfulness**: Whether agent responses accurately reflect tool outputs \n", - "- **Relevance Assessment**: How well responses address the original banking query\n", - "- **Context Utilization**: How effectively the agent incorporates tool results into final answers\n", - "\n", - "These tests provide insights into how well our banking agent integrates tool usage with conversational abilities, ensuring it provides accurate, relevant, and helpful responses to banking users." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Faithfulness\n", - "\n", - "Faithfulness measures how accurately the banking agent's responses reflect the information retrieved from tools. 
This metric evaluates:\n", - "\n", - "**Information Accuracy**: Whether the agent correctly uses tool outputs in its responses\n", - "- **Fact Preservation**: Ensuring credit scores, loan calculations, compliance results are accurately reported\n", - "- **No Hallucination**: Verifying the agent doesn't invent banking information not provided by tools\n", - "- **Source Attribution**: Checking that responses align with actual tool outputs\n", - "\n", - "**Critical for Banking Trust**: Faithfulness is essential for banking agent reliability because users need to trust that:\n", - "- Credit analysis results are reported correctly\n", - "- Financial calculations are accurate \n", - "- Compliance checks return real information\n", - "- Risk assessments are properly communicated" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.Faithfulness\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"response_column\": [\"banking_agent_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Response Relevancy\n", - "\n", - "Response Relevancy evaluates how well the banking agent's answers address the user's original banking question or request. This metric assesses:\n", - "\n", - "**Query Alignment**: Whether responses directly answer what users asked for\n", - "- **Intent Fulfillment**: Checking if the agent understood and addressed the user's actual banking need\n", - "- **Completeness**: Ensuring responses provide sufficient information to satisfy the banking query\n", - "- **Focus**: Avoiding irrelevant information that doesn't help the banking user\n", - "\n", - "**Banking Quality**: Measures the agent's ability to maintain relevant, helpful banking dialogue\n", - "- **Context Awareness**: Responses should be appropriate for the banking conversation context\n", - "- **User Satisfaction**: Answers should be useful and actionable for banking users\n", - "- **Clarity**: Banking information should be presented in a way that directly helps the user\n", - "\n", - "High relevancy indicates the banking agent successfully understands user needs and provides targeted, helpful banking responses." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " params={\n", - " \"user_input_column\": \"input\",\n", - " \"response_column\": \"banking_agent_model_prediction\",\n", - " \"retrieved_contexts_column\": \"banking_agent_model_tool_messages\",\n", - " }\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Recall\n", - "\n", - "Context Recall measures how well the banking agent utilizes the information retrieved from tools when generating its responses. 
This metric evaluates:\n", - "\n", - "**Information Utilization**: Whether the agent effectively incorporates tool outputs into its responses\n", - "- **Coverage**: How much of the available tool information is used in the response\n", - "- **Integration**: How well tool outputs are woven into coherent, natural banking responses\n", - "- **Completeness**: Whether all relevant information from tools is considered\n", - "\n", - "**Tool Effectiveness**: Assesses whether selected banking tools provide useful context for responses\n", - "- **Relevance**: Whether tool outputs actually help answer the user's banking question\n", - "- **Sufficiency**: Whether enough information was retrieved to generate good banking responses\n", - "- **Quality**: Whether the tools provided accurate, helpful banking information\n", - "\n", - "High context recall indicates the banking agent not only selects the right tools but also effectively uses their outputs to create comprehensive, well-informed banking responses." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextRecall\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " \"reference_column\": [\"banking_agent_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Safety\n", - "\n", - "Safety testing is critical for banking AI agents to ensure they operate reliably and securely.\n", - "These tests help validate that our banking agent maintains high standards of fairness and professionalism." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### AspectCritic\n", - "\n", - "AspectCritic provides comprehensive evaluation across multiple dimensions of banking agent performance. This metric analyzes various aspects of response quality:\n", - "\n", - "**Multi-Dimensional Assessment**: Evaluates responses across different quality criteria:\n", - " - **Conciseness**: Whether responses are clear and to-the-point without unnecessary details\n", - " - **Coherence**: Whether responses are logically structured and easy to follow\n", - " - **Correctness**: Accuracy of banking information and appropriateness of recommendations\n", - " - **Harmfulness**: Whether responses could cause harm or damage to users or systems\n", - " - **Maliciousness**: Whether responses contain malicious content or intent\n", - "\n", - "**Holistic Quality Scoring**: Provides an overall assessment that considers:\n", - "- **User Experience**: How satisfying and useful the banking interaction would be for real users\n", - "- **Professional Standards**: Whether responses meet quality expectations for production banking systems\n", - "- **Consistency**: Whether the banking agent maintains quality across different types of requests\n", - "\n", - "AspectCritic helps identify specific areas where the banking agent excels or needs improvement, providing actionable insights for enhancing overall performance and user satisfaction in banking scenarios." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AspectCritic\",\n", - " inputs={\"dataset\": vm_test_dataset},\n", - " param_grid={\n", - " \"user_input_column\": [\"input\"],\n", - " \"response_column\": [\"banking_agent_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Prompt bias\n", - "\n", - "Let's check if the agent's prompts contain unintended biases that could affect banking decisions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Bias\",\n", - " inputs={\n", - " \"model\": vm_banking_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity\n", - "\n", - "Let's ensure responses are professional and appropriate for banking contexts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Toxicity\",\n", - " inputs={\n", - " \"dataset\": vm_test_dataset,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Demo Summary and Next Steps\n", - "\n", - "We have successfully built and tested a comprehensive **Banking AI Agent** using LangGraph and ValidMind. Here's what we've accomplished:\n", - "\n", - "\n", - "\n", - "### What We Built\n", - "\n", - "1. **5 Specialized Banking Tools**\n", - " - Credit Risk Analyzer for loan assessments\n", - " - Customer Account Manager for account services\n", - " - Fraud Detection System for security monitoring\n", - "\n", - "2. **Intelligent LangGraph Agent**\n", - " - Automatic tool selection based on user requests\n", - " - Banking-specific system prompts and guidance\n", - " - Professional banking assistance and responses\n", - "\n", - "3. **Comprehensive Testing Framework**\n", - " - banking-specific test cases\n", - " - ValidMind integration for validation\n", - " - Performance analysis across banking domains\n", - "\n", - "\n", - "\n", - "### Next Steps\n", - "\n", - "1. **Customize Tools**: Adapt the banking tools to your specific banking requirements\n", - "2. **Expand Test Cases**: Add more banking scenarios and edge cases\n", - "3. **Integrate with Real Data**: Connect to actual banking systems and databases\n", - "4. **Add More Tools**: Implement additional banking-specific functionality\n", - "5. **Production Deployment**: Deploy the agent in a production banking environment\n", - "\n", - "\n", - "\n", - "### Key Benefits\n", - "\n", - "- **Industry-Specific**: Designed specifically for banking operations\n", - "- **Regulatory Compliance**: Built-in SR 11-7 and SS 1-23 compliance checks\n", - "- **Risk Management**: Comprehensive credit and fraud risk assessment\n", - "- **Customer Focus**: Tools for both retail and commercial banking needs\n", - "- **Real-World Applicability**: Addresses actual banking use cases and challenges\n", - "\n", - "Your banking AI agent is now ready to handle real-world banking scenarios while maintaining regulatory compliance and risk management best practices!" 
- ] - }, - { - "cell_type": "markdown", - "id": "copyright-e7184e5605bb4f85b3d7b8306aaaef78", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ValidMind (Poetry)", - "language": "python", - "name": "validmind" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models_quantlib.ipynb b/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models_quantlib.ipynb deleted file mode 100644 index 1269254add..0000000000 --- a/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models_quantlib.ipynb +++ /dev/null @@ -1,1330 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "1e2a4689", - "metadata": {}, - "source": [ - "# Quickstart for Heston option pricing model using QuantLib\n", - "\n", - "Welcome! Let's get you started with the basic process of documenting models with ValidMind.\n", - "\n", - "The Heston option pricing model is a popular stochastic volatility model used to price options. Developed by Steven Heston in 1993, the model assumes that the asset's volatility follows a mean-reverting square-root process, allowing it to capture the empirical observation of volatility \"clustering\" in financial markets. This model is particularly useful for assets where volatility is not constant, making it a favored approach in quantitative finance for pricing complex derivatives.\n", - "\n", - "Here’s an overview of the Heston model as implemented in QuantLib, a powerful library for quantitative finance:\n", - "\n", - "\n", - "\n", - "### Model Assumptions and Characteristics\n", - "1. **Stochastic Volatility**: The volatility is modeled as a stochastic process, following a mean-reverting square-root process (Cox-Ingersoll-Ross process).\n", - "2. **Correlated Asset and Volatility Processes**: The asset price and volatility are assumed to be correlated, allowing the model to capture the \"smile\" effect observed in implied volatilities.\n", - "3. 
**Risk-Neutral Dynamics**: The Heston model is typically calibrated under a risk-neutral measure, which allows for direct application to pricing.\n", - "\n", - "\n", - "\n", - "### Heston Model Parameters\n", - "The model is governed by a set of key parameters:\n", - "- **S0**: Initial stock price\n", - "- **v0**: Initial variance of the asset price\n", - "- **kappa**: Speed of mean reversion of the variance\n", - "- **theta**: Long-term mean level of variance\n", - "- **sigma**: Volatility of volatility (vol of vol)\n", - "- **rho**: Correlation between the asset price and variance processes\n", - "\n", - "The dynamics of the asset price \\( S \\) and the variance \\( v \\) under the Heston model are given by:\n", - "\n", - "$$\n", - "dS_t = r S_t \\, dt + \\sqrt{v_t} S_t \\, dW^S_t\n", - "$$\n", - "\n", - "$$\n", - "dv_t = \\kappa (\\theta - v_t) \\, dt + \\sigma \\sqrt{v_t} \\, dW^v_t\n", - "$$\n", - "\n", - "where \\( $dW^S$ \\) and \\( $dW^v$ \\) are Wiener processes with correlation \\( $\\rho$ \\).\n", - "\n", - "\n", - "\n", - "### Advantages and Limitations\n", - "- **Advantages**:\n", - " - Ability to capture volatility smiles and skews.\n", - " - More realistic pricing for options on assets with stochastic volatility.\n", - "- **Limitations**:\n", - " - Calibration can be complex due to the number of parameters.\n", - " - Computationally intensive compared to simpler models like Black-Scholes.\n", - "\n", - "This setup provides a robust framework for pricing and analyzing options with stochastic volatility dynamics. QuantLib’s implementation makes it easy to experiment with different parameter configurations and observe their effects on pricing.\n", - "\n", - "You will learn how to initialize the ValidMind Library, develop a option pricing model, and then write custom tests that can be used for sensitivity and stress testing to quickly generate documentation about model." 
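Before diving into the ValidMind setup, here is a minimal, self-contained QuantLib sketch of the pricing step this notebook builds toward: constructing a Heston process from the parameters above and pricing a one-year European call with the semi-analytic Heston engine. The parameter values are illustrative placeholders, not the ones used later in the notebook.

```python
import QuantLib as ql

# Evaluation date and illustrative market data
today = ql.Date(15, ql.June, 2024)
ql.Settings.instance().evaluationDate = today

spot, strike = 100.0, 100.0
r, q = 0.03, 0.0                                             # flat risk-free and dividend rates
v0, kappa, theta, sigma, rho = 0.04, 2.0, 0.04, 0.3, -0.7    # Heston parameters (illustrative)

day_count = ql.Actual365Fixed()
spot_handle = ql.QuoteHandle(ql.SimpleQuote(spot))
rate_ts = ql.YieldTermStructureHandle(ql.FlatForward(today, r, day_count))
div_ts = ql.YieldTermStructureHandle(ql.FlatForward(today, q, day_count))

# Heston process, model, and semi-analytic pricing engine
process = ql.HestonProcess(rate_ts, div_ts, spot_handle, v0, kappa, theta, sigma, rho)
engine = ql.AnalyticHestonEngine(ql.HestonModel(process))

# One-year European call option
option = ql.VanillaOption(
    ql.PlainVanillaPayoff(ql.Option.Call, strike),
    ql.EuropeanExercise(today + ql.Period(1, ql.Years)),
)
option.setPricingEngine(engine)

print(f"Heston call price: {option.NPV():.4f}")
```

Later sections wrap this kind of pricing logic in custom ValidMind tests so that sensitivity and stress results can be logged as documentation evidence.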
- ] - }, - { - "cell_type": "markdown", - "id": "69ec219a", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - " - [Model Assumptions and Characteristics](#toc1_1__) \n", - " - [Heston Model Parameters](#toc1_2__) \n", - " - [Advantages and Limitations](#toc1_3__) \n", - "- [About ValidMind](#toc2__) \n", - " - [Before you begin](#toc2_1__) \n", - " - [New to ValidMind?](#toc2_2__) \n", - " - [Key concepts](#toc2_3__) \n", - "- [Setting up](#toc3__) \n", - " - [Install the ValidMind Library](#toc3_1__) \n", - " - [Initialize the ValidMind Library](#toc3_2__) \n", - " - [Register sample model](#toc3_2_1__) \n", - " - [Apply documentation template](#toc3_2_2__) \n", - " - [Get your code snippet](#toc3_2_3__) \n", - " - [Initialize the Python environment](#toc3_3__) \n", - " - [Preview the documentation template](#toc3_4__) \n", - "- [Data Preparation](#toc4__) \n", - " - [Helper functions](#toc4_1_1__) \n", - " - [Market Data Quality and Availability](#toc4_2__) \n", - " - [Initialize the ValidMind datasets](#toc4_3__) \n", - " - [Data Quality](#toc4_4__) \n", - " - [Isolation Forest Outliers Test](#toc4_4_1__) \n", - " - [Model parameters](#toc4_4_2__) \n", - "- [Model development - Heston Option price](#toc5__) \n", - " - [Model Calibration](#toc5_1__) \n", - " - [Model Evaluation](#toc5_2__) \n", - " - [Benchmark Testing](#toc5_2_1__) \n", - " - [Sensitivity Testing](#toc5_2_2__) \n", - " - [Stress Testing](#toc5_2_3__) \n", - "- [Next steps](#toc6__) \n", - " - [Work with your model documentation](#toc6_1__) \n", - " - [Discover more learning resources](#toc6_2__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "b9fb5d17", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
- ] - }, - { - "cell_type": "markdown", - "id": "f2dccf35", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "id": "5a5ce085", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "409352bf", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "id": "65e870b2", - "metadata": {}, - "source": [ - "To install the QuantLib library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a34debf", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q QuantLib" - ] - }, - { - "cell_type": "markdown", - "id": "fb30ae07", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "id": "c6f87017", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "id": "cbb2e2c9", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Capital markets`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "id": "2012eb82", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. 
Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0cd3f67e", - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "6d944cc9", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the Python environment\n", - "\n", - "Next, let's import the necessary libraries and set up your Python environment for data analysis:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8cf2746", - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from scipy.optimize import minimize\n", - "import yfinance as yf\n", - "import QuantLib as ql\n", - "from validmind.tests import run_test" - ] - }, - { - "cell_type": "markdown", - "id": "bc431ee0", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Preview the documentation template\n", - "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", - "\n", - "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e844028", - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "id": "0c0ee8b9", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Data Preparation" - ] - }, - { - "cell_type": "markdown", - "id": "5a4d2c36", - "metadata": {}, - "source": [ - "### Market Data Sources\n", - "\n", - "\n", - "\n", - "#### Helper functions\n", - "Let's define helper function retrieve to option data from Yahoo Finance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b96a500f", - "metadata": {}, - "outputs": [], - "source": [ - "def get_market_data(ticker, expiration_date_str):\n", - " \"\"\"\n", - " Fetch option market data from Yahoo Finance for the given ticker and expiration date.\n", - " Returns a list of tuples: (strike, maturity, option_price).\n", - " \"\"\"\n", - " # Create a Ticker object for the specified stock\n", - " stock = yf.Ticker(ticker)\n", - "\n", - " # Get all available expiration dates for options\n", - " option_dates = stock.options\n", - "\n", - " # Check if the requested expiration date is available\n", - " if expiration_date_str not in option_dates:\n", - " raise ValueError(f\"Expiration date {expiration_date_str} not available for {ticker}. 
Available dates: {option_dates}\")\n", - "\n", - " # Get the option chain for the specified expiration date\n", - " option_chain = stock.option_chain(expiration_date_str)\n", - "\n", - " # Get call options (or you can use puts as well based on your requirement)\n", - " calls = option_chain.calls\n", - "\n", - " # Convert expiration_date_str to QuantLib Date\n", - " expiry_date_parts = list(map(int, expiration_date_str.split('-'))) # Split YYYY-MM-DD\n", - " maturity_date = ql.Date(expiry_date_parts[2], expiry_date_parts[1], expiry_date_parts[0]) # Convert to QuantLib Date\n", - "\n", - " # Create a list to store strike prices, maturity dates, and option prices\n", - " market_data = []\n", - " for index, row in calls.iterrows():\n", - " strike = row['strike']\n", - " option_price = row['lastPrice'] # You can also use 'bid', 'ask', 'mid', etc.\n", - " market_data.append((strike, maturity_date, option_price))\n", - " df = pd.DataFrame(market_data, columns = ['strike', 'maturity_date', 'option_price'])\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "id": "c7769b73", - "metadata": {}, - "source": [ - "Let's define helper function retrieve to stock data from Yahoo Finance. This helper function to calculate spot price, dividend yield, volatility and risk free rate using the underline stock data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc44c448", - "metadata": {}, - "outputs": [], - "source": [ - "def get_option_parameters(ticker):\n", - " # Fetch historical data for the stock\n", - " stock_data = yf.Ticker(ticker)\n", - " \n", - " # Get the current spot price\n", - " spot_price = stock_data.history(period=\"1d\")['Close'].iloc[-1]\n", - " \n", - " # Get dividend yield\n", - " dividend_rate = stock_data.dividends.mean() / spot_price if not stock_data.dividends.empty else 0.0\n", - " \n", - " # Estimate volatility (standard deviation of log returns)\n", - " hist_data = stock_data.history(period=\"1y\")['Close']\n", - " log_returns = np.log(hist_data / hist_data.shift(1)).dropna()\n", - " volatility = np.std(log_returns) * np.sqrt(252) # Annualized volatility\n", - " \n", - " # Assume a risk-free rate from some known data (can be fetched from market data, here we use 0.001)\n", - " risk_free_rate = 0.001\n", - " \n", - " # Return the calculated parameters\n", - " return {\n", - " \"spot_price\": spot_price,\n", - " \"volatility\": volatility,\n", - " \"dividend_rate\": dividend_rate,\n", - " \"risk_free_rate\": risk_free_rate\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "c7b739d3", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Market Data Quality and Availability\n", - "Next, let's specify ticker and expiration date to get market data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50225fde", - "metadata": {}, - "outputs": [], - "source": [ - "ticker = \"MSFT\"\n", - "expiration_date = \"2024-12-13\" # Example expiration date in 'YYYY-MM-DD' form\n", - "\n", - "market_data = get_market_data(ticker=ticker, expiration_date_str=expiration_date)" - ] - }, - { - "cell_type": "markdown", - "id": "c539b95e", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind datasets\n", - "\n", - "Before you can run tests, you must first initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "113f9c17", - "metadata": {}, - "outputs": [], - "source": [ - "vm_market_data = vm.init_dataset(\n", - " dataset=market_data,\n", - " input_id=\"market_data\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "185beb24", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Data Quality\n", - "Let's check the quality of the data using outlier and missing data tests." - ] - }, - { - "cell_type": "markdown", - "id": "7f14464c", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Isolation Forest Outliers Test\n", - "Let's detect anomalies in the dataset using the Isolation Forest algorithm, visualized through scatter plots." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56c919ec", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"validmind.data_validation.IsolationForestOutliers\",\n", - " inputs={\n", - " \"dataset\": vm_market_data,\n", - " },\n", - " title=\"Outliers detection using Isolation Forest\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "e4d0e5ca", - "metadata": {}, - "source": [ - "##### Missing Values Test\n", - "Let's evaluate dataset quality by ensuring that the missing value ratio across all features does not exceed a set threshold." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e95c825f", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"validmind.data_validation.MissingValues\",\n", - " inputs={\n", - " \"dataset\": vm_market_data,\n", - " },\n", - " title=\"Missing Values detection\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "829403a3", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Model parameters\n", - "Let's calculate the model parameters from the stock data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25936449", - "metadata": {}, - "outputs": [], - "source": [ - "option_params = get_option_parameters(ticker=ticker)" - ] - }, - { - "cell_type": "markdown", - "id": "0a0948b6", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Model development - Heston Option price" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e15b8221", - "metadata": {}, - "outputs": [], - "source": [ - "class HestonModel:\n", - "\n", - " def __init__(self, ticker, expiration_date_str, calculation_date, spot_price, dividend_rate, risk_free_rate):\n", - " self.ticker = ticker\n", - " self.expiration_date_str = expiration_date_str\n", - " self.calculation_date = calculation_date\n", - " self.spot_price = spot_price\n", - " self.dividend_rate = dividend_rate\n", - " self.risk_free_rate = risk_free_rate\n", - " \n", - " def predict_option_price(self, strike, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", - " # Set the evaluation date\n", - " ql.Settings.instance().evaluationDate = self.calculation_date\n", - "\n", - " # Construct the European Option\n", - " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", - " exercise = ql.EuropeanExercise(maturity_date)\n", - " european_option = ql.VanillaOption(payoff, exercise)\n", - "\n", - " # Yield term structures for risk-free rate and dividend\n", - " riskFreeTS = ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.risk_free_rate, ql.Actual365Fixed()))\n", - " dividendTS = ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.dividend_rate, ql.Actual365Fixed()))\n", - "\n", - " # Initial stock 
price\n", - " initialValue = ql.QuoteHandle(ql.SimpleQuote(spot_price))\n", - "\n", - " # Heston process parameters\n", - " heston_process = ql.HestonProcess(riskFreeTS, dividendTS, initialValue, v0, kappa, theta, sigma, rho)\n", - " hestonModel = ql.HestonModel(heston_process)\n", - "\n", - " # Use the Heston analytic engine\n", - " engine = ql.AnalyticHestonEngine(hestonModel)\n", - " european_option.setPricingEngine(engine)\n", - "\n", - " # Calculate the Heston model price\n", - " h_price = european_option.NPV()\n", - "\n", - " return h_price\n", - "\n", - " def predict_american_option_price(self, strike, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", - " # Set the evaluation date\n", - " ql.Settings.instance().evaluationDate = self.calculation_date\n", - "\n", - " # Construct the American Option\n", - " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", - " exercise = ql.AmericanExercise(self.calculation_date, maturity_date)\n", - " american_option = ql.VanillaOption(payoff, exercise)\n", - "\n", - " # Yield term structures for risk-free rate and dividend\n", - " riskFreeTS = ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.risk_free_rate, ql.Actual365Fixed()))\n", - " dividendTS = ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.dividend_rate, ql.Actual365Fixed()))\n", - "\n", - " # Initial stock price\n", - " initialValue = ql.QuoteHandle(ql.SimpleQuote(spot_price))\n", - "\n", - " # Heston process parameters\n", - " heston_process = ql.HestonProcess(riskFreeTS, dividendTS, initialValue, v0, kappa, theta, sigma, rho)\n", - " heston_model = ql.HestonModel(heston_process)\n", - "\n", - "\n", - " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", - " exercise = ql.AmericanExercise(self.calculation_date, maturity_date)\n", - " american_option = ql.VanillaOption(payoff, exercise)\n", - " heston_fd_engine = ql.FdHestonVanillaEngine(heston_model)\n", - " american_option.setPricingEngine(heston_fd_engine)\n", - " option_price = american_option.NPV()\n", - "\n", - " return option_price\n", - "\n", - " def objective_function(self, params, market_data, spot_price, dividend_rate, risk_free_rate):\n", - " v0, theta, kappa, sigma, rho = params\n", - "\n", - " # Sum of squared differences between market prices and model prices\n", - " error = 0.0\n", - " for i, row in market_data.iterrows():\n", - " model_price = self.predict_option_price(row['strike'], row['maturity_date'], spot_price, \n", - " v0, theta, kappa, sigma, rho)\n", - " error += (model_price - row['option_price']) ** 2\n", - " \n", - " return error\n", - "\n", - " def calibrate_model(self, ticker, expiration_date_str):\n", - " # Get the option market data dynamically from Yahoo Finance\n", - " market_data = get_market_data(ticker, expiration_date_str)\n", - "\n", - " # Initial guesses for Heston parameters\n", - " initial_params = [0.04, 0.04, 0.1, 0.1, -0.75]\n", - "\n", - " # Bounds for the parameters to ensure realistic values\n", - " bounds = [(0.0001, 1.0), # v0\n", - " (0.0001, 1.0), # theta\n", - " (0.001, 2.0), # kappa\n", - " (0.001, 1.0), # sigma\n", - " (-0.75, 0.0)] # rho\n", - "\n", - " # Optimize the parameters to minimize the error between model and market prices\n", - " result = minimize(self.objective_function, initial_params, args=(market_data, self.spot_price, self.dividend_rate, self.risk_free_rate),\n", - " bounds=bounds, method='L-BFGS-B')\n", - "\n", - " # Optimized Heston parameters\n", - " v0_opt, theta_opt, 
kappa_opt, sigma_opt, rho_opt = result.x\n", - "\n", - " return v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt\n" - ] - }, - { - "cell_type": "markdown", - "id": "a941aa32", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Model Calibration\n", - "* The calibration process aims to optimize the Heston model parameters (v0, theta, kappa, sigma, rho) by minimizing the difference between model-predicted option prices and observed market prices.\n", - "* In this implementation, the model is calibrated to current market data, specifically using option prices from the selected ticker and expiration date.\n", - "\n", - "Let's specify `calculation_date` and `strike_price` as input parameters for the model to verify its functionality and confirm it operates as expected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d61dfca", - "metadata": {}, - "outputs": [], - "source": [ - "calculation_date = ql.Date(26, 11, 2024)\n", - "# Convert expiration date string to QuantLib.Date\n", - "expiry_date_parts = list(map(int, expiration_date.split('-')))\n", - "maturity_date = ql.Date(expiry_date_parts[2], expiry_date_parts[1], expiry_date_parts[0])\n", - "strike_price = 460.0\n", - "\n", - "hm = HestonModel(\n", - " ticker=ticker,\n", - " expiration_date_str= expiration_date,\n", - " calculation_date= calculation_date,\n", - " spot_price= option_params['spot_price'],\n", - " dividend_rate = option_params['dividend_rate'],\n", - " risk_free_rate = option_params['risk_free_rate']\n", - ")\n", - "\n", - "# Let's calibrate model\n", - "v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt = hm.calibrate_model(ticker, expiration_date)\n", - "print(f\"Optimized Heston parameters: v0={v0_opt}, theta={theta_opt}, kappa={kappa_opt}, sigma={sigma_opt}, rho={rho_opt}\")\n", - "\n", - "\n", - "# option price\n", - "h_price = hm.predict_option_price(strike_price, maturity_date, option_params['spot_price'], v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt)\n", - "print(\"The Heston model price for the option is:\", h_price)" - ] - }, - { - "cell_type": "markdown", - "id": "75313272", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Model Evaluation" - ] - }, - { - "cell_type": "markdown", - "id": "2e6471ef", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Benchmark Testing\n", - "The benchmark testing framework provides a robust way to validate the Heston model implementation and understand the relationships between European and American option prices under stochastic volatility conditions.\n", - "Let's compares European and American option prices using the Heston model." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "810cf887", - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.BenchmarkTest\")\n", - "def benchmark_test(hm_model, strikes, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", - " \"\"\"\n", - " Compares European and American option prices using the Heston model.\n", - "\n", - " This test evaluates the price differences between European and American options\n", - " across multiple strike prices while keeping other parameters constant. 
The comparison\n", - " helps understand the early exercise premium of American options over their European\n", - " counterparts under stochastic volatility conditions.\n", - "\n", - " Args:\n", - " hm_model: HestonModel instance for option pricing calculations\n", - " strikes (list[float]): List of strike prices to test\n", - " maturity_date (ql.Date): Option expiration date in QuantLib format\n", - " spot_price (float): Current price of the underlying asset\n", - " v0 (float, optional): Initial variance. Defaults to None.\n", - " theta (float, optional): Long-term variance. Defaults to None.\n", - " kappa (float, optional): Mean reversion rate. Defaults to None.\n", - " sigma (float, optional): Volatility of variance. Defaults to None.\n", - " rho (float, optional): Correlation between asset and variance. Defaults to None.\n", - "\n", - " Returns:\n", - " dict: Contains a DataFrame with the following columns:\n", - " - Strike: Strike prices tested\n", - " - Maturity date: Expiration date for all options\n", - " - Spot price: Current underlying price\n", - " - european model price: Prices for European options\n", - " - american model price: Prices for American options\n", - "\"\"\"\n", - " american_derived_prices = []\n", - " european_derived_prices = []\n", - " for K in strikes:\n", - " european_derived_prices.append(hm_model.predict_option_price(K, maturity_date, spot_price, v0, theta, kappa, sigma, rho))\n", - " american_derived_prices.append(hm_model.predict_american_option_price(K, maturity_date, spot_price, v0, theta, kappa, sigma, rho))\n", - "\n", - " data = {\n", - " \"Strike\": strikes,\n", - " \"Maturity date\": [maturity_date] * len(strikes),\n", - " \"Spot price\": [spot_price] * len(strikes),\n", - " \"european model price\": european_derived_prices,\n", - " \"american model price\": american_derived_prices,\n", - "\n", - " }\n", - " df1 = pd.DataFrame(data)\n", - " return {\"strikes variation benchmarking\": df1}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3fdd6705", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.BenchmarkTest\",\n", - " params={\n", - " \"hm_model\": hm,\n", - " \"strikes\": [400, 425, 460, 495, 520],\n", - " \"maturity_date\": maturity_date,\n", - " \"spot_price\": option_params['spot_price'],\n", - " \"v0\":v0_opt,\n", - " \"theta\": theta_opt,\n", - " \"kappa\":kappa_opt ,\n", - " \"sigma\": sigma_opt,\n", - " \"rho\":rho_opt\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "e359b503", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Sensitivity Testing\n", - "The sensitivity testing framework provides a systematic approach to understanding how the Heston model responds to parameter changes, which is crucial for both model validation and practical application in trading and risk management." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51922313", - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_test_provider.Sensitivity\")\n", - "def SensitivityTest(\n", - " model,\n", - " strike_price,\n", - " maturity_date,\n", - " spot_price,\n", - " v0_opt,\n", - " theta_opt,\n", - " kappa_opt,\n", - " sigma_opt,\n", - " rho_opt,\n", - "):\n", - " \"\"\"\n", - " Evaluates the sensitivity of American option prices to changes in model parameters.\n", - "\n", - " This test calculates option prices using the Heston model with optimized parameters.\n", - " It's designed to analyze how changes in various model inputs affect the option price,\n", - " which is crucial for understanding model behavior and risk management.\n", - "\n", - " Args:\n", - " model (HestonModel): Initialized Heston model instance wrapped in ValidMind model object\n", - " strike_price (float): Strike price of the option\n", - " maturity_date (ql.Date): Expiration date of the option in QuantLib format\n", - " spot_price (float): Current price of the underlying asset\n", - " v0_opt (float): Optimized initial variance parameter\n", - " theta_opt (float): Optimized long-term variance parameter\n", - " kappa_opt (float): Optimized mean reversion rate parameter\n", - " sigma_opt (float): Optimized volatility of variance parameter\n", - " rho_opt (float): Optimized correlation parameter between asset price and variance\n", - " \"\"\"\n", - " price = model.model.predict_american_option_price(\n", - " strike_price,\n", - " maturity_date,\n", - " spot_price,\n", - " v0_opt,\n", - " theta_opt,\n", - " kappa_opt,\n", - " sigma_opt,\n", - " rho_opt,\n", - " )\n", - "\n", - " return price\n" - ] - }, - { - "cell_type": "markdown", - "id": "408a05ef", - "metadata": {}, - "source": [ - "##### Common plot function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "104ca6dd", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_results(df, params: dict = None):\n", - " fig2 = plt.figure(figsize=(10, 6))\n", - " plt.plot(df[params[\"x\"]], df[params[\"y\"]], label=params[\"label\"])\n", - " plt.xlabel(params[\"xlabel\"])\n", - " plt.ylabel(params[\"ylabel\"])\n", - " \n", - " plt.title(params[\"title\"])\n", - " plt.legend()\n", - " plt.grid(True)\n", - " plt.show() # display the plot" - ] - }, - { - "cell_type": "markdown", - "id": "ca72b9e5", - "metadata": {}, - "source": [ - "Let's create a ValidMind model object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae7093fa", - "metadata": {}, - "outputs": [], - "source": [ - "hm_model = vm.init_model(model=hm, input_id=\"HestonModel\")" - ] - }, - { - "cell_type": "markdown", - "id": "b2141640", - "metadata": {}, - "source": [ - "##### Strike sensitivity\n", - "Let's analyze how option prices change as the strike price varies. We create a range of strike prices around the current strike (460) and observe the impact on option prices while keeping all other parameters constant."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea7f1cbe", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_test_provider.Sensitivity:ToStrike\",\n", - " inputs = {\n", - " \"model\": hm_model\n", - " },\n", - " param_grid={\n", - " \"strike_price\": list(np.linspace(460-50, 460+50, 10)),\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": [theta_opt],\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\":[rho_opt]\n", - " },\n", - ")\n", - "result.log()\n", - "# Visualize how option prices change with different strike prices\n", - "plot_results(\n", - " pd.DataFrame(result.tables[0].data),\n", - " params={\n", - " \"x\": \"strike_price\",\n", - " \"y\":\"Value\",\n", - " \"label\":\"Strike price\",\n", - " \"xlabel\":\"Strike price\",\n", - " \"ylabel\":\"option price\",\n", - " \"title\":\"Heston option - Strike price Sensitivity\",\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "be143012", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Stress Testing\n", - "This stress testing framework provides a comprehensive view of how the Heston model behaves under different market conditions and helps identify potential risks in option pricing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2f01a40", - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.Stressing\")\n", - "def StressTest(\n", - " model,\n", - " strike_price,\n", - " maturity_date,\n", - " spot_price,\n", - " v0_opt,\n", - " theta_opt,\n", - " kappa_opt,\n", - " sigma_opt,\n", - " rho_opt,\n", - "):\n", - " \"\"\"\n", - " Performs stress testing on Heston model parameters to evaluate option price sensitivity.\n", - "\n", - " This test evaluates how the American option price responds to stressed market conditions\n", - " by varying key model parameters. It's designed to:\n", - " 1. Identify potential model vulnerabilities\n", - " 2. Understand price behavior under extreme scenarios\n", - " 3. Support risk management decisions\n", - " 4. 
Validate model stability across parameter ranges\n", - "\n", - " Args:\n", - " model (HestonModel): Initialized Heston model instance wrapped in ValidMind model object\n", - " strike_price (float): Option strike price\n", - " maturity_date (ql.Date): Option expiration date in QuantLib format\n", - " spot_price (float): Current price of the underlying asset\n", - " v0_opt (float): Initial variance parameter under stress testing\n", - " theta_opt (float): Long-term variance parameter under stress testing\n", - " kappa_opt (float): Mean reversion rate parameter under stress testing\n", - " sigma_opt (float): Volatility of variance parameter under stress testing\n", - " rho_opt (float): Correlation parameter under stress testing\n", - " \"\"\"\n", - " price = model.model.predict_american_option_price(\n", - " strike_price,\n", - " maturity_date,\n", - " spot_price,\n", - " v0_opt,\n", - " theta_opt,\n", - " kappa_opt,\n", - " sigma_opt,\n", - " rho_opt,\n", - " )\n", - "\n", - " return price\n" - ] - }, - { - "cell_type": "markdown", - "id": "31fcbe9c", - "metadata": {}, - "source": [ - "##### Rho (correlation) and Theta (long term vol) stress test\n", - "Next, let's evaluates the sensitivity of a model's output to changes in the correlation parameter (rho) and the long-term variance parameter (theta) within a stochastic volatility framework." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6119b5d9", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheRhoAndThetaParameters\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": list(np.linspace(0.1, theta_opt+0.4, 5)),\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\":list(np.linspace(rho_opt-0.2, rho_opt+0.2, 5))\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "be39cb3a", - "metadata": {}, - "source": [ - "##### Sigma stress test\n", - "Let's evaluates the sensitivity of a model's output to changes in the volatility parameter, sigma. This test is crucial for understanding how variations in market volatility impact the model's valuation of financial instruments, particularly options." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0dc189b7", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheSigmaParameter\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": [theta_opt],\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": list(np.linspace(0.1, sigma_opt+0.6, 5)),\n", - " \"rho_opt\": [rho_opt]\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "173a5294", - "metadata": {}, - "source": [ - "##### Stress kappa\n", - "Let's evaluates the sensitivity of a model's output to changes in the kappa parameter, which is a mean reversion rate in stochastic volatility models." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dae9714f", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheKappaParameter\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": [theta_opt],\n", - " \"kappa_opt\": list(np.linspace(kappa_opt, kappa_opt+0.2, 5)),\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\": [rho_opt]\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "b4d1d968", - "metadata": {}, - "source": [ - "##### Stress theta\n", - "Let's evaluates the sensitivity of a model's output to changes in the parameter theta, which represents the long-term variance in a stochastic volatility model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e68df3db", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheThetaParameter\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": list(np.linspace(0.1, theta_opt+0.9, 5)),\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\": [rho_opt]\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "32e70456", - "metadata": {}, - "source": [ - "##### Stress rho\n", - "Let's evaluates the sensitivity of a model's output to changes in the correlation parameter, rho, within a stochastic volatility (SV) model framework. This test is crucial for understanding how variations in rho, which represents the correlation between the asset price and its volatility, impact the model's valuation output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5ca3fc2", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheRhoParameter\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": [theta_opt],\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\": list(np.linspace(rho_opt-0.2, rho_opt+0.2, 5))\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "892c5347", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Next steps\n", - "\n", - "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", - "\n", - "\n", - "\n", - "### Work with your model documentation\n", - "\n", - "1. From the **Model Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", - "\n", - "2. Click and expand the **Model Development** section.\n", - "\n", - "What you see is the full draft of your model documentation in a more easily consumable version. 
From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html)\n", - "\n", - "\n", - "\n", - "### Discover more learning resources\n", - "\n", - "We offer many interactive notebooks to help you document models:\n", - "\n", - "- [Run tests & test suites](https://docs.validmind.ai/developer/model-testing/testing-overview.html)\n", - "- [Code samples](https://docs.validmind.ai/developer/samples-jupyter-notebooks.html)\n", - "\n", - "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-de5d1e182b09403abddabc2850f2dd05", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "validmind-1QuffXMV-py3.10", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/site/notebooks/code_samples/code_explainer/quickstart_code_explainer_demo.ipynb b/site/notebooks/code_samples/code_explainer/quickstart_code_explainer_demo.ipynb deleted file mode 100644 index 1eb1ef747e..0000000000 --- a/site/notebooks/code_samples/code_explainer/quickstart_code_explainer_demo.ipynb +++ /dev/null @@ -1,862 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Quickstart for model code documentation\n", - "\n", - "Welcome! This notebook demonstrates how to use the ValidMind code explainer to automatically generate comprehensive documentation for your codebase. The code explainer analyzes your source code and provides detailed explanations across various aspects of your implementation.\n", - "\n", - "\n", - "\n", - "## About Code Explainer\n", - "The ValidMind code explainer is a powerful tool that automatically analyzes your source code and generates comprehensive documentation. It helps you:\n", - "\n", - "- Understand the structure and organization of your codebase\n", - "- Document dependencies and environment setup\n", - "- Explain data processing and model implementation details\n", - "- Document training, evaluation, and inference pipelines\n", - "- Track configuration, testing, and security measures\n", - "\n", - "This tool is particularly useful for:\n", - "- Onboarding new team members\n", - "- Maintaining up-to-date documentation\n", - "- Ensuring code quality and best practices\n", - "- Facilitating code reviews and audits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About Code Explainer](#toc1__) \n", - "- [About ValidMind](#toc2__) \n", - " - [Before you begin](#toc2_1__) \n", - " - [New to ValidMind?](#toc2_2__) \n", - " - [Key concepts](#toc2_3__) \n", - "- [Setting up](#toc3__) \n", - " - [Install the ValidMind Library](#toc3_1__) \n", - " - [Initialize the ValidMind Library](#toc3_2__) \n", - " - [Register sample model](#toc3_2_1__) \n", - " - [Apply documentation template](#toc3_2_2__) \n", - " - [Get your code snippet](#toc3_2_3__) \n", - " - [Preview the documentation template](#toc3_3__) \n", - "- [Common function](#toc4__) \n", - "- [Default Behavior](#toc5__) \n", - "- [Codebase Overview](#toc6__) \n", - "- [Environment and Dependencies ('environment_setup')](#toc7__) \n", - "- [Data Ingestion and Preprocessing](#toc8__) \n", - "- [Model Implementation Details](#toc9__) \n", - "- [Model Training Pipeline](#toc10__) \n", - "- [Evaluation and Validation Code](#toc11__) \n", - "- [Inference and Scoring Logic](#toc12__) \n", - "- [Configuration and Parameters](#toc13__) \n", - "- [Unit and Integration Testing](#toc14__) \n", - "- [Logging and Monitoring Hooks](#toc15__) \n", - "- [Code and Model Versioning](#toc16__) \n", - "- [Security and Access Control](#toc17__) \n", - "- [Example Runs and Scripts](#toc18__) \n", - "- [Known Issues and Future Improvements](#toc19__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, 
- "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - " For example, to register a model for use with this notebook, select the following use case: `Credit Risk - CECL`\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Model Source Code Documentation`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. 
Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Preview the documentation template\n", - "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", - "\n", - "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Common function\n", - "The code above defines two key functions:\n", - "1. A function to read source code from 'customer_churn_full_suite.py' file\n", - "2. An 'explain_code' function that uses ValidMind's experimental agents to analyze and explain code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_code=\"\"\n", - "with open(\"customer_churn_full_suite.py\", \"r\") as f:\n", - " source_code = f.read()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `vm.experimental.agents.run_task` function is used to execute AI agent tasks.\n", - "\n", - "It requires:\n", - "- task: The type of task to run (e.g. 
`code_explainer`)\n", - "- input: A dictionary containing task-specific parameters\n", - " - For `code_explainer`, this includes:\n", - " - **source_code** (str): The code to be analyzed\n", - " - **user_instructions** (str): Instructions for how to analyze the code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def explain_code(content_id: str, user_instructions: str):\n", - " \"\"\"Run code explanation task and log the results.\n", - " By default, the code explainer includes sections for:\n", - " - Main Purpose and Overall Functionality\n", - " - Breakdown of Key Functions or Components\n", - " - Potential Risks or Failure Points \n", - " - Assumptions or Limitations\n", - " If you want default sections, specify user_instructions as an empty string.\n", - " \n", - " Args:\n", - " user_instructions (str): Instructions for how to analyze the code\n", - " content_id (str): ID to use when logging the results\n", - " \n", - " Returns:\n", - " The result object from running the code explanation task\n", - " \"\"\"\n", - " result = vm.experimental.agents.run_task(\n", - " task=\"code_explainer\",\n", - " input={\n", - " \"source_code\": source_code,\n", - " \"user_instructions\": user_instructions\n", - " }\n", - " )\n", - " result.log(content_id=content_id)\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Default Behavior" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, the code explainer includes sections for:\n", - "- Main Purpose and Overall Functionality\n", - "- Breakdown of Key Functions or Components\n", - "- Potential Risks or Failure Points \n", - "- Assumptions or Limitations\n", - "\n", - "If you want default sections, specify `user_instructions` as an empty string. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = vm.experimental.agents.run_task(\n", - " task=\"code_explainer\",\n", - " input={\n", - " \"source_code\": source_code,\n", - " \"user_instructions\": \"\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Codebase Overview\n", - "\n", - "Let's analyze your codebase structure to understand the main modules, components, entry points and their relationships. We'll also examine the technology stack and frameworks that are being utilized in the implementation." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe the overall structure of the source code repository.\n", - " - Identify main modules, folders, and scripts.\n", - " - Highlight entry points for training, inference, and evaluation.\n", - " - State the main programming languages and frameworks used.\n", - " \"\"\",\n", - " content_id=\"code_structure_summary\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\",\n", - " content_id=\"code_structure_summary\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Environment and Dependencies ('environment_setup')\n", - "Let's document the technical requirements and setup needed to run your code, including Python packages, system dependencies, and environment configuration files. Understanding these requirements is essential for proper development environment setup and consistent deployments across different environments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - List Python packages and system dependencies (OS, compilers, etc.).\n", - " - Reference environment files (requirements.txt, environment.yml, Dockerfile).\n", - " - Include setup instructions using Conda, virtualenv, or containers.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"setup_instructions\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Data Ingestion and Preprocessing\n", - "Let's document how your code handles data, including data sources, validation procedures, and preprocessing steps. We'll examine the data pipeline architecture, covering everything from initial data loading through feature engineering and quality checks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Specify data input formats and sources.\n", - " - Document ingestion, validation, and transformation logic.\n", - " - Explain how raw data is preprocessed and features are generated.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections. \"\"\",\n", - " content_id=\"data_handling_notes\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - "\n", - "\n", - "\n", - "## Model Implementation Details\n", - "Let's document the core implementation details of your model, including its architecture, components, and key algorithms. Understanding the technical implementation is crucial for maintenance, debugging, and future improvements to the codebase. We'll examine how theoretical concepts are translated into working code." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe the core model code structure (classes, functions).\n", - " - Link code to theoretical models or equations when applicable.\n", - " - Note custom components like loss functions or feature selectors.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"model_code_description\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Model Training Pipeline\n", - "\n", - "Let's document the training pipeline implementation, including how models are trained, optimized and evaluated. We'll examine the training process workflow, hyperparameter tuning approach, and model checkpointing mechanisms. This section provides insights into how the model learns from data and achieves optimal performance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Explain the training process, optimization strategy, and hyperparameters.\n", - " - Describe logging, checkpointing, and early stopping mechanisms.\n", - " - Include references to training config files or tuning logic.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"training_logic_details\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Evaluation and Validation Code\n", - "Let's examine how the model's validation and evaluation code is implemented, including the metrics calculation and validation processes. We'll explore the diagnostic tools and visualization methods used to assess model performance. This section will also cover how validation results are logged and stored for future reference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe how validation is implemented and metrics are calculated.\n", - " - Include plots and diagnostic tools (e.g., ROC, SHAP, confusion matrix).\n", - " - State how outputs are logged and persisted.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"evaluation_logic_notes\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Inference and Scoring Logic\n", - "Let's examine how the model performs inference and scoring on new data. This section will cover the implementation details of loading trained models, making predictions, and any required pre/post-processing steps. We'll also look at the APIs and interfaces available for both real-time serving and batch scoring scenarios." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Detail how the trained model is loaded and used for predictions.\n", - " - Explain I/O formats and APIs for serving or batch scoring.\n", - " - Include any preprocessing/postprocessing logic required.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"inference_mechanism\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Configuration and Parameters\n", - "Let's explore how configuration and parameters are managed in the codebase. We'll examine the configuration files, command-line arguments, environment variables, and other mechanisms used to control model behavior. This section will also cover parameter versioning and how different configurations are tracked across model iterations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe configuration management (files, CLI args, env vars).\n", - " - Highlight default parameters and override mechanisms.\n", - " - Reference versioning practices for config files.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"config_control_notes\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Unit and Integration Testing\n", - "Let's examine the testing strategy and implementation in the codebase. We'll analyze the unit tests, integration tests, and testing frameworks used to ensure code quality and reliability. This section will also cover test coverage metrics and continuous integration practices." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - List unit and integration tests and what they cover.\n", - " - Mention testing frameworks and coverage tools used.\n", - " - Explain testing strategy for production-readiness.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"test_strategy_overview\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Logging and Monitoring Hooks\n", - "Let's analyze how logging and monitoring are implemented in the codebase. We'll examine the logging configuration, monitoring hooks, and key metrics being tracked. This section will also cover any real-time observability integrations and alerting mechanisms in place." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe logging configuration and structure.\n", - " - Highlight real-time monitoring or observability integrations.\n", - " - List key events, metrics, or alerts tracked.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"logging_monitoring_notes\"\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Code and Model Versioning\n", - "Let's examine how code and model versioning is managed in the codebase. This section will cover version control practices, including Git workflows and model artifact versioning tools like DVC or MLflow. We'll also look at how versioning integrates with the CI/CD pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe Git usage, branching, tagging, and commit standards.\n", - " - Include model artifact versioning practices (e.g., DVC, MLflow).\n", - " - Reference any automation in CI/CD.\n", - " Please remove the following sections: \n", - " - Potential Risks or Failure Points\n", - " - Assumptions or Limitations\n", - " - Breakdown of Key Functions or Components\n", - " Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"version_tracking_description\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Security and Access Control\n", - "Let's analyze the security and access control measures implemented in the codebase. We'll examine how sensitive data and code are protected through access controls, encryption, and compliance measures. Additionally, we'll review secure deployment practices and any specific handling of PII data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Document access controls for source code and data.\n", - " - Include any encryption, PII handling, or compliance measures.\n", - " - Mention secure deployment practices.\n", - " Please remove the following sections: \n", - " - Potential Risks or Failure Points\n", - " - Assumptions or Limitations\n", - " - Breakdown of Key Functions or Components\n", - " Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"security_policies_notes\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Example Runs and Scripts\n", - "Let's explore example runs and scripts that demonstrate how to use this codebase in practice. We'll look at working examples, command-line usage, and sample notebooks that showcase the core functionality. This section will also point to demo datasets and test scenarios that can help new users get started quickly." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Provide working script examples.\n", - " - Include CLI usage instructions or sample notebooks.\n", - " - Link to demo datasets or test scenarios.\n", - " Please remove the following sections: \n", - " - Potential Risks or Failure Points\n", - " - Assumptions or Limitations\n", - " - Breakdown of Key Functions or Components\n", - " Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"runnable_examples\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Known Issues and Future Improvements\n", - "Let's examine the current limitations and areas for improvement in the codebase. This section will document known technical debt, bugs, and feature gaps that need to be addressed. We'll also outline proposed enhancements and reference any existing tickets or GitHub issues tracking these improvements." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - List current limitations or technical debt.\n", - " - Outline proposed enhancements or refactors.\n", - " - Reference relevant tickets, GitHub issues, or roadmap items.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"issues_and_improvements_log\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "copyright-ccbede139a26452183291a108b791513", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "validmind-1QuffXMV-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/site/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb b/site/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb deleted file mode 100644 index 17385b8e1c..0000000000 --- a/site/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb +++ /dev/null @@ -1,1095 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Implement custom tests\n", - "\n", - "Custom tests extend the functionality of ValidMind, allowing you to document any model or use case with added flexibility.\n", - "\n", - "ValidMind provides a comprehensive set of tests out-of-the-box to evaluate and document your models and datasets. We recognize there will be cases where the default tests do not support a model or dataset, or specific documentation is needed. In these cases, you can create and use your own custom code to accomplish what you need. To streamline custom code integration, we support the creation of custom test functions.\n", - "\n", - "This interactive notebook provides a step-by-step guide for implementing and registering custom tests with ValidMind, running them individually, viewing the results on the ValidMind Platform, and incorporating them into your model documentation template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Install the ValidMind Library](#toc2_1__) \n", - " - [Initialize the ValidMind Library](#toc2_2__) \n", - " - [Register sample model](#toc2_2_1__) \n", - " - [Apply documentation template](#toc2_2_2__) \n", - " - [Get your code snippet](#toc2_2_3__) \n", - "- [Implement a Custom Test](#toc3__) \n", - "- [Run the Custom Test](#toc4__) \n", - " - [Setup the Model and Dataset](#toc4_1__) \n", - " - [Run the Custom Test](#toc4_2__) \n", - "- [Adding Custom Test to Model Documentation](#toc5__) \n", - "- [Some More Custom Tests](#toc6__) \n", - " - [Custom Test: Table of Model Hyperparameters](#toc6_1__) \n", - " - [Custom Test: External API Call](#toc6_2__) \n", - " - [Custom Test: Passing Parameters](#toc6_3__) \n", - " - [Custom Test: Multiple Tables and Plots in a Single Test](#toc6_4__) \n", - " - [Custom Test: Images](#toc6_5__) \n", - " - [Custom Test: Description](#toc6_6__) \n", - "- [Conclusion](#toc7__) \n", - "- [Next steps](#toc8__) \n", - " - [Work with your model documentation](#toc8_1__) \n", - " - [Discover more learning resources](#toc8_2__) \n", - "- [Upgrade ValidMind](#toc9__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the 
ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom test can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - " For example, to register a model for use with this notebook, select the following use case: `{use-case}`\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Binary classification`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. 
Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Implement a Custom Test\n", - "\n", - "Let's start off by creating a simple custom test that creates a Confusion Matrix for a binary classification model. We will use the `sklearn.metrics.confusion_matrix` function to calculate the confusion matrix and then display it as a heatmap using `plotly`. (This is already a built-in test in ValidMind, but we will use it as an example to demonstrate how to create custom tests.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "from sklearn import metrics\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.ConfusionMatrix\")\n", - "def confusion_matrix(dataset, model):\n", - " \"\"\"The confusion matrix is a table that is often used to describe the performance of a classification model on a set of data for which the true values are known.\n", - "\n", - " The confusion matrix is a 2x2 table that contains 4 values:\n", - "\n", - " - True Positive (TP): the number of correct positive predictions\n", - " - True Negative (TN): the number of correct negative predictions\n", - " - False Positive (FP): the number of incorrect positive predictions\n", - " - False Negative (FN): the number of incorrect negative predictions\n", - "\n", - " The confusion matrix can be used to assess the holistic performance of a classification model by showing the accuracy, precision, recall, and F1 score of the model on a single figure.\n", - " \"\"\"\n", - " y_true = dataset.y\n", - " y_pred = dataset.y_pred(model)\n", - "\n", - " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", - "\n", - " cm_display = metrics.ConfusionMatrixDisplay(\n", - " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", - " )\n", - " cm_display.plot()\n", - "\n", - " plt.close() # close the plot to avoid displaying it\n", - "\n", - " return cm_display.figure_ # return the figure object itself" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Thats our custom test defined and ready to go... Let's take a look at whats going on here:\n", - "\n", - "- The function `confusion_matrix` takes two arguments `dataset` and `model`. This is a VMDataset and VMModel object respectively.\n", - "- The function docstring provides a description of what the test does. 
This will be displayed along with the result in this notebook as well as in the ValidMind Platform.\n", - "- The function body calculates the confusion matrix using the `sklearn.metrics.confusion_matrix` function and then plots it using `sklearn.metric.ConfusionMatrixDisplay`.\n", - "- The function then returns the `ConfusionMatrixDisplay.figure_` object - this is important as the ValidMind Library expects the output of the custom test to be a plot or a table.\n", - "- The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ConfusionMatrix` (see the section below on how test IDs work in ValidMind and why this format is important)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Run the Custom Test\n", - "\n", - "Now that we have defined and registered our custom test, lets see how we can run it and properly use it in the ValidMind Platform." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Setup the Model and Dataset\n", - "\n", - "First let's setup a an example model and dataset to run our custom metic against. Since this is a Confusion Matrix, we will use the Customer Churn dataset that ValidMind provides and train a simple XGBoost model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import xgboost as xgb\n", - "from validmind.datasets.classification import customer_churn\n", - "\n", - "raw_df = customer_churn.load_data()\n", - "train_df, validation_df, test_df = customer_churn.preprocess(raw_df)\n", - "\n", - "x_train = train_df.drop(customer_churn.target_column, axis=1)\n", - "y_train = train_df[customer_churn.target_column]\n", - "x_val = validation_df.drop(customer_churn.target_column, axis=1)\n", - "y_val = validation_df[customer_churn.target_column]\n", - "\n", - "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", - "model.set_params(\n", - " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", - ")\n", - "model.fit(\n", - " x_train,\n", - " y_train,\n", - " eval_set=[(x_val, y_val)],\n", - " verbose=False,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Easy enough! Now we have a model and dataset setup and trained. One last thing to do is bring the dataset and model into the ValidMind Library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for now, we'll just use the test dataset\n", - "vm_test_ds = vm.init_dataset(\n", - " dataset=test_df,\n", - " target_column=customer_churn.target_column,\n", - " input_id=\"test_dataset\",\n", - ")\n", - "\n", - "vm_model = vm.init_model(model, input_id=\"model\")\n", - "\n", - "# link the model to the dataset\n", - "vm_test_ds.assign_predictions(model=vm_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Run the Custom Test\n", - "\n", - "Now that we have our model and dataset setup, we have everything we need to run our custom test. 
We can do this by importing the `run_test` function from the `validmind.tests` module and passing in the test ID of our custom test along with the model and dataset we want to run it against.\n", - "\n", - ">Notice how the `inputs` dictionary is used to map an `input_id` which we set above to the `model` and `dataset` keys that are expected by our custom test function. This is how the ValidMind Library knows which inputs to pass to different tests and is key when using many different datasets and models." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "\n", - "result = run_test(\n", - " \"my_custom_tests.ConfusionMatrix\",\n", - " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You'll notice that the docstring becomes a markdown description of the test. The figure is then displayed as the test result. What you see above is how it will look in the ValidMind Platform as well. Let's go ahead and log the result to see how that works." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Adding Custom Test to Model Documentation\n", - "\n", - "To do this, go to the documentation page of the model you registered above and navigate to the `Model Development` -> `Model Evaluation` section. Then hover between any existing content block to reveal the `+` button as shown in the screenshot below.\n", - "\n", - "![screenshot showing insert button for test-driven blocks](../../images/insert-test-driven-block.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now click on the `+` button and select the `Test-Driven Block` option. This will open a dialog where you can select `My Custom Tests Confusion Matrix` from the list of available tests. You can preview the result and then click `Insert Block` to add it to the documentation.\n", - "\n", - "![screenshot showing how to insert a test-driven block](../../images/insert-test-driven-block-custom.png)\n", - "\n", - "The test should match the result you see above. It is now part of your documentation and will now be run everytime you run `vm.run_documentation_tests()` for your model. Let's do that now." 
 - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm.reload()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you preview the template, it should show the custom test in the `Model Development`->`Model Evaluation` section:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Just so we can run all of the tests in the template, let's initialize the train and raw datasets.\n", - "\n", - "(Refer to [**Quickstart for model documentation**](../../quickstart/quickstart_model_documentation.ipynb) and the ValidMind docs for more information on what we are doing here)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset = vm.init_dataset(\n", - " dataset=raw_df,\n", - " input_id=\"raw_dataset\",\n", - " target_column=customer_churn.target_column,\n", - " class_labels=customer_churn.class_labels,\n", - ")\n", - "\n", - "vm_train_ds = vm.init_dataset(\n", - " dataset=train_df,\n", - " input_id=\"train_dataset\",\n", - " target_column=customer_churn.target_column,\n", - ")\n", - "vm_train_ds.assign_predictions(model=vm_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To run all the tests in the template, you can use the `vm.run_documentation_tests()` function and pass the inputs we initialized above along with the demo config from our customer_churn module. We will have to add a section to the config for our new test to tell it which inputs it should receive. This is done by simply adding a new element to the config dictionary where the key is the ID of the test and the value is a dictionary with the following structure:\n", - "```python\n", - "{\n", - " \"inputs\": {\n", - " \"dataset\": \"test_dataset\",\n", - " \"model\": \"model\",\n", - " }\n", - "}\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.utils import preview_test_config\n", - "\n", - "test_config = customer_churn.get_demo_test_config()\n", - "test_config[\"my_custom_tests.ConfusionMatrix\"] = {\n", - " \"inputs\": {\n", - " \"dataset\": \"test_dataset\",\n", - " \"model\": \"model\",\n", - " }\n", - "}\n", - "preview_test_config(test_config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "full_suite = vm.run_documentation_tests(config=test_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Some More Custom Tests\n", - "\n", - "Now that you understand the entire process of creating custom tests and using them in your documentation, let's create a few more to see different ways you can utilize custom tests."
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Table of Model Hyperparameters\n", - "\n", - "This custom test will display a table of the hyperparameters used in the model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.Hyperparameters\")\n", - "def hyperparameters(model):\n", - " \"\"\"The hyperparameters of a machine learning model are the settings that control the learning process.\n", - " These settings are specified before the learning process begins and can have a significant impact on the\n", - " performance of the model.\n", - "\n", - " The hyperparameters of a model can be used to tune the model to achieve the best possible performance\n", - " on a given dataset. By examining the hyperparameters of a model, you can gain insight into how the model\n", - " was trained and how it might be improved.\n", - " \"\"\"\n", - " hyperparameters = model.model.get_xgb_params() # dictionary of hyperparameters\n", - "\n", - " # turn the dictionary into a table where each row contains a hyperparameter and its value\n", - " return [{\"Hyperparam\": k, \"Value\": v} for k, v in hyperparameters.items() if v]\n", - "\n", - "\n", - "result = run_test(\"my_custom_tests.Hyperparameters\", inputs={\"model\": \"model\"})\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since the test has been run and logged, you can add it to your documentation using the same process as above. It should look like this:\n", - "\n", - "![screenshot showing hyperparameters test](../../images/hyperparameters-custom-metric.png)\n", - "\n", - "For our simple toy model, there are aren't really any proper hyperparameters but you can see how this could be useful for more complex models that have gone through hyperparameter tuning." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: External API Call\n", - "\n", - "This custom test will make an external API call to get the current BTC price and display it as a table. This demonstrates how you might integrate external data sources into your model documentation in a programmatic way. You could, for instance, setup a pipeline that runs a test like this every day to keep your model documentation in sync with an external system." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import random\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.ExternalAPI\")\n", - "def external_api():\n", - " \"\"\"This test calls an external API to get a list of fake users. It then creates\n", - " a table with the relevant data so it can be displayed in the documentation.\n", - "\n", - " The purpose of this test is to demonstrate how to call an external API and use the\n", - " data in a test. 
A test like this could even be setup to run in a scheduled\n", - " pipeline to keep your documentation in-sync with an external data source.\n", - " \"\"\"\n", - " url = \"https://jsonplaceholder.typicode.com/users\"\n", - " response = requests.get(url)\n", - " data = response.json()\n", - "\n", - " # extract the time and the current BTC price in USD\n", - " return {\n", - " \"Model Owners/Stakeholders\": [\n", - " {\n", - " \"Name\": user[\"name\"],\n", - " \"Role\": random.choice([\"Owner\", \"Stakeholder\"]),\n", - " \"Email\": user[\"email\"],\n", - " \"Phone\": user[\"phone\"],\n", - " \"Slack Handle\": f\"@{user['name'].lower().replace(' ', '.')}\",\n", - " }\n", - " for user in data[:3]\n", - " ],\n", - " \"Model Developers\": [\n", - " {\n", - " \"Name\": user[\"name\"],\n", - " \"Role\": \"Developer\",\n", - " \"Email\": user[\"email\"],\n", - " }\n", - " for user in data[3:7]\n", - " ],\n", - " \"Model Validators\": [\n", - " {\n", - " \"Name\": user[\"name\"],\n", - " \"Role\": \"Validator\",\n", - " \"Email\": user[\"email\"],\n", - " }\n", - " for user in data[7:]\n", - " ],\n", - " }\n", - "\n", - "\n", - "result = run_test(\"my_custom_tests.ExternalAPI\")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again, you can add this to your documentation to see how it looks:\n", - "\n", - "![screenshot showing BTC price metric](../../images/external-data-custom-test.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Passing Parameters\n", - "\n", - "Custom test functions, as stated earlier, can take both inputs and params. When you define your function there is no need to distinguish between the two, the ValidMind Library will handle that for you. 
You simply need to add both to the function as arguments and the library will pass in the correct values.\n", - "\n", - "So for instance, if you wanted to parameterize the first custom test we created, the confusion matrix, you could do so like this:\n", - "\n", - "```python\n", - "def confusion_matrix(dataset: VMDataset, model: VMModel, my_param: str = \"Default Value\"):\n", - " pass\n", - "```\n", - "\n", - "And then when you run the test, you can pass in the parameter like this:\n", - "\n", - "```python\n", - "vm.run_test(\n", - " \"my_custom_tests.ConfusionMatrix\",\n", - " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", - " params={\"my_param\": \"My Value\"},\n", - ")\n", - "```\n", - "\n", - "Or if you are running the entire documentation template, you would update the config like this:\n", - "\n", - "```python\n", - "test_config[\"my_custom_tests.ConfusionMatrix\"] = {\n", - " \"inputs\": {\n", - " \"dataset\": \"test_dataset\",\n", - " \"model\": \"model\",\n", - " },\n", - " \"params\": {\n", - " \"my_param\": \"My Value\",\n", - " },\n", - "}\n", - "```\n", - "\n", - "Let's go ahead and create a toy test that takes a parameter and uses it in the result:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import plotly.express as px\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.ParameterExample\")\n", - "def parameter_example(\n", - " plot_title=\"Default Plot Title\", x_col=\"sepal_width\", y_col=\"sepal_length\"\n", - "):\n", - " \"\"\"This test takes two parameters and creates a scatter plot based on them.\n", - "\n", - " The purpose of this test is to demonstrate how to create a test that takes\n", - " parameters and uses them to generate a plot. This can be useful for creating\n", - " tests that are more flexible and can be used in a variety of scenarios.\n", - " \"\"\"\n", - " # return px.scatter(px.data.iris(), x=x_col, y=y_col, color=\"species\")\n", - " return px.scatter(\n", - " px.data.iris(), x=x_col, y=y_col, color=\"species\", title=plot_title\n", - " )\n", - "\n", - "\n", - "result = run_test(\n", - " \"my_custom_tests.ParameterExample\",\n", - " params={\n", - " \"plot_title\": \"My Cool Plot\",\n", - " \"x_col\": \"sepal_width\",\n", - " \"y_col\": \"sepal_length\",\n", - " },\n", - ")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Play around with this and see how you can use parameters, default values and other features to make your custom tests more flexible and useful.\n", - "\n", - "Here's how this one looks in the documentation:\n", - "![screenshot showing parameterized test](../../images/parameterized-custom-metric.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Multiple Tables and Plots in a Single Test\n", - "\n", - "Custom test functions, as stated earlier, can return more than just one table or plot. In fact, any number of tables and plots can be returned. 
Let's see an example of this:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import plotly.express as px\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.ComplexOutput\")\n", - "def complex_output():\n", - " \"\"\"This test demonstrates how to return many tables and figures in a single test\"\"\"\n", - " # create a couple tables\n", - " table = [{\"A\": 1, \"B\": 2}, {\"A\": 3, \"B\": 4}]\n", - " table2 = [{\"C\": 5, \"D\": 6}, {\"C\": 7, \"D\": 8}]\n", - "\n", - " # create a few figures showing some random data\n", - " fig1 = px.line(x=np.arange(10), y=np.random.rand(10), title=\"Random Line Plot\")\n", - " fig2 = px.bar(x=[\"A\", \"B\", \"C\"], y=np.random.rand(3), title=\"Random Bar Plot\")\n", - " fig3 = px.scatter(\n", - " x=np.random.rand(10), y=np.random.rand(10), title=\"Random Scatter Plot\"\n", - " )\n", - "\n", - " return (\n", - " {\n", - " \"My Cool Table\": table,\n", - " \"Another Table\": table2,\n", - " },\n", - " fig1,\n", - " fig2,\n", - " fig3,\n", - " )\n", - "\n", - "\n", - "result = run_test(\"my_custom_tests.ComplexOutput\")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice how you can return the tables as a dictionary where the key is the title of the table and the value is the table itself. You could also just return the tables by themselves but this way you can give them a title to more easily identify them in the result.\n", - "\n", - "![screenshot showing multiple tables and plots](../../images/multiple-tables-plots-custom-metric.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Images\n", - "\n", - "If you are using a plotting library that isn't supported by ValidMind (i.e. not `matplotlib` or `plotly`), you can still return the image directly as a bytes-like object. This could also be used to bring any type of image into your documentation in a programmatic way. For instance, you may want to include a diagram of your model architecture or a screenshot of a dashboard that your model is integrated with. As long as you can produce the image with Python or open it from a file, you can include it in your documentation." 
 - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import io\n", - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.Image\")\n", - "def image():\n", - " \"\"\"This test demonstrates how to return an image in a test\"\"\"\n", - "\n", - " # create a simple plot\n", - " fig, ax = plt.subplots()\n", - " ax.plot([1, 2, 3, 4])\n", - " ax.set_title(\"Simple Line Plot\")\n", - "\n", - " # save the plot as a PNG image (in-memory buffer)\n", - " img_data = io.BytesIO()\n", - " fig.savefig(img_data, format=\"png\")\n", - " img_data.seek(0)\n", - "\n", - " plt.close() # close the plot to avoid displaying it\n", - "\n", - " return img_data.read()\n", - "\n", - "\n", - "result = run_test(\"my_custom_tests.Image\")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Adding this custom test to your documentation will display the image:\n", - "\n", - "![screenshot showing image custom test](../../images/image-in-custom-metric.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to log an image from a file as a test result, you can do so by passing the path to the image as a parameter to the custom test and then opening the file in the test function. Here's an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.MyPNGCorrelationMatrix\")\n", - "def png_image_from_file(path: str):\n", - " \"\"\"Opens a PNG image file and logs it as a test result to ValidMind\"\"\"\n", - " if not path.endswith(\".png\"):\n", - " raise ValueError(\"Image must be a PNG file\")\n", - "\n", - " # return raw image bytes\n", - " with open(path, \"rb\") as f:\n", - " return f.read()\n", - "\n", - "run_test(\n", - " \"my_custom_tests.MyPNGCorrelationMatrix\",\n", - " params={\"path\": \"../../images/pearson-correlation-matrix.png\"},\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The image is displayed in the test result:\n", - "\n", - "![screenshot showing image from file](../../images/pearson-correlation-matrix-test-output.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Description\n", - "\n", - "If you want to write your own description for a custom test result, instead of having one generated automatically by an LLM, you can do so by returning a string from your test function."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "@vm.test(\"my_custom_tests.MyCustomTest\")\n", - "def my_custom_test(dataset, model):\n", - " \"\"\"\n", - " This is a custom computed test that computes confusion matrix for a binary classification model and return a string as a test description.\n", - " \"\"\"\n", - " y_true = dataset.y\n", - " y_pred = dataset.y_pred(model)\n", - "\n", - " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", - "\n", - " cm_display = metrics.ConfusionMatrixDisplay(\n", - " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", - " )\n", - " cm_display.plot()\n", - "\n", - " plt.close() # close the plot to avoid displaying it\n", - "\n", - " return cm_display.figure_, \"Test Description - Confusion Matrix\", pd.DataFrame({\"Value\": [1, 2, 3]}) # return the figure object itself\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can see here test result description has been customized here. The same result description will be displayed in the UI." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.MyCustomTest\",\n", - " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", - ")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Conclusion\n", - "\n", - "In this notebook, we have demonstrated how to create custom tests in ValidMind. We have shown how to define custom test functions, register them with the ValidMind Library, run them against models and datasets, and add them to model documentation templates. We have also shown how to return tables and plots from custom tests and how to use them in the ValidMind Platform. We hope this tutorial has been helpful in understanding how to create and use custom tests in ValidMind." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Next steps\n", - "\n", - "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", - "\n", - "\n", - "\n", - "### Work with your model documentation\n", - "\n", - "1. From the **Model Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", - "\n", - "2. Click and expand the **Model Development** section.\n", - "\n", - "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. 
[Learn more ...](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html)\n", - "\n", - "\n", - "\n", - "### Discover more learning resources\n", - "\n", - "We offer many interactive notebooks to help you document models:\n", - "\n", - "- [Run tests & test suites](https://docs.validmind.ai/developer/model-testing/testing-overview.html)\n", - "- [Code samples](https://docs.validmind.ai/developer/samples-jupyter-notebooks.html)\n", - "\n", - "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-997b933948594ddd929ee9419957dfe3", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/site/notebooks/code_samples/nlp_and_llm/rag_benchmark_demo.ipynb b/site/notebooks/code_samples/nlp_and_llm/rag_benchmark_demo.ipynb deleted file mode 100644 index 35969b1eb3..0000000000 --- a/site/notebooks/code_samples/nlp_and_llm/rag_benchmark_demo.ipynb +++ /dev/null @@ -1,1857 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# RAG Model Benchmarking Demo\n", - "\n", - "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. We'll demonstrate how to set up multiple models for benchmarking at each stage of the RAG pipeline - specifically two embedding models, two retrieval models with different parameters, and two LLM models (GPT-3.5 and GPT-4o) - allowing for comparison of performance across different configurations. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Install the ValidMind Library](#toc2_1__) \n", - " - [Initialize the ValidMind Library](#toc2_2__) \n", - " - [Register sample model](#toc2_2_1__) \n", - " - [Apply documentation template](#toc2_2_2__) \n", - " - [Get your code snippet](#toc2_2_3__) \n", - "- [Read Open AI API Key](#toc3__) \n", - "- [Dataset Loader](#toc4__) \n", - "- [Data validation](#toc5__) \n", - " - [Duplicates](#toc5_1__) \n", - " - [Stop Words](#toc5_2__) \n", - " - [Punctuations](#toc5_3__) \n", - " - [Common Words](#toc5_4__) \n", - " - [Language Detection](#toc5_5__) \n", - " - [Toxicity Score](#toc5_6__) \n", - " - [Polarity and Subjectivity](#toc5_7__) \n", - " - [Sentiment](#toc5_8__) \n", - " - [Assign Predictions](#toc5_9__) \n", - " - [Run tests](#toc5_10__) \n", - " - [Generate embeddings for the Train Set](#toc5_11__) \n", - " - [Insert embeddings and questions into Vector DB](#toc5_12__) \n", - "- [Prompt Evaluation](#toc6__) \n", - "- [RAGAS evaluation](#toc7__) \n", - " - [Semantic Similarity](#toc7_1__) \n", - " - [Context Entity Recall](#toc7_2__) \n", - " - [Context Precision](#toc7_3__) \n", - " - [Context Precision Without Reference](#toc7_4__) \n", - " - [Faithfulness](#toc7_5__) \n", - " - [Response Relevancy](#toc7_6__) \n", - " - [Context Recall](#toc7_7__) \n", - " - [Answer Correctness](#toc7_8__) \n", - " - [Aspect Critic](#toc7_9__) \n", - " - [Noise Sensitivity](#toc7_10__) \n", - "- [Generation quality](#toc8__) \n", - " - [Token Disparity](#toc8_1__) \n", - " - [ROUGE Score](#toc8_2__) \n", - " - [BLEU Score](#toc8_3__) \n", - " - [BERT Score](#toc8_4__) \n", - " - 
[METEOR Score](#toc8_5__) \n", - "- [Bias and Toxicity](#toc9__) \n", - " - [Toxicity Score](#toc9_1__) \n", - " - [Regard Score](#toc9_2__) \n", - "- [Upgrade ValidMind](#toc10__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
To access all features available in this notebook, you'll need a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", - "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", - "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prerequisites\n", - "\n", - "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q \"validmind[llm]\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q qdrant-client langchain langchain-openai sentencepiece" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "
Recommended Python versions\n", - "

\n", - "Python 3.8 <= x <= 3.11
\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Analytics`\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Gen AI RAG Template`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", - " api_key = \"...\",\n", - " api_secret = \"...\",\n", - " model = \"...\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Read Open AI API Key\n", - "\n", - "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` and `text-embedding-3-large` models for our embeddings, `gpt-3.5-turbo` and `gpt-4o` models for our generator and `gpt-4o` model for our LLM-as-Judge tests. 
If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load openai api key\n", - "import os\n", - "\n", - "import dotenv\n", - "import nltk\n", - "\n", - "dotenv.load_dotenv()\n", - "nltk.download('stopwords')\n", - "nltk.download('punkt_tab')\n", - "\n", - "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", - "\n", - "if not \"OPENAI_API_KEY\" in os.environ:\n", - " raise ValueError(\"OPENAI_API_KEY is not set\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Dataset Loader\n", - "\n", - "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Import the sample dataset from the library\n", - "from validmind.datasets.llm.rag import rfp\n", - "\n", - "raw_df = rfp.load_data()\n", - "train_df, test_df = rfp.preprocess(raw_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds = vm.init_dataset(\n", - " train_df,\n", - " text_column=\"question\",\n", - " target_column=\"ground_truth\",\n", - ")\n", - "\n", - "vm_test_ds = vm.init_dataset(\n", - " test_df,\n", - " text_column=\"question\",\n", - " target_column=\"ground_truth\",\n", - ")\n", - "\n", - "vm_test_ds.df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Data validation\n", - "\n", - "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in array of text data quality tests to check that things like number of duplicates, missing values, and other common text data issues are not present in our dataset. We can also run some tests to check the sentiment and toxicity of our data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Duplicates\n", - "\n", - "First, let's check for duplicates in our dataset. 
We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "\n", - "run_test(\n", - " test_id=\"validmind.data_validation.Duplicates\",\n", - " inputs={\"dataset\": vm_train_ds},\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Stop Words\n", - "\n", - "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.StopWords` test and pass our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.StopWords\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Punctuations\n", - "\n", - "Next, let's check for punctuations in our dataset. We can use the `validmind.data_validation.Punctuations` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.Punctuations\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Common Words\n", - "\n", - "Next, let's check for common words in our dataset. We can use the `validmind.data_validation.CommonWord` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.CommonWords\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Language Detection\n", - "\n", - "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.LanguageDetection` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity Score\n", - "\n", - "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Toxicity\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Polarity and Subjectivity\n", - "\n", - "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "\n", - "\n", - "### Sentiment\n", - "\n", - "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Sentiment\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Embedding Model\n", - "\n", - "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding model. We will use `text-embedding-3-small` and `text-embedding-3-large` models from OpenAI for this purpose wrapped in the `OpenAIEmbeddings` class from LangChain. This model will be used to \"embed\" our questions both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the question from inputs when making predictions with our RAG model." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import OpenAIEmbeddings\n", - "\n", - "embedding_small_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", - "\n", - "\n", - "def embed_small(input):\n", - " \"\"\"Returns a text embedding for the given text\"\"\"\n", - " return embedding_small_client.embed_query(input[\"question\"])\n", - "\n", - "\n", - "vm_embedder_small = vm.init_model(input_id=\"embedding_small_model\", predict_fn=embed_small)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embedding_large_client = OpenAIEmbeddings(model=\"text-embedding-3-large\")\n", - "\n", - "\n", - "def embed_large(input):\n", - " \"\"\"Returns a text embedding for the given text\"\"\"\n", - " return embedding_large_client.embed_query(input[\"question\"])\n", - "\n", - "\n", - "vm_embedder_large = vm.init_model(input_id=\"embedding_large_model\", predict_fn=embed_large)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What we have done here is to initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` and `text-embedding-3-large` models. We then created an `embed` function that takes in an `input` dictionary and uses the `embed_query` method of the embedding client to compute the embeddings of the `question`. We use an `embed` function since that is how ValidMind supports any custom model. We will use this strategy for the retrieval and generator models as well but you could also use, say, a HuggingFace model directly. See the documentation for more information on which model types are directly supported - [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html)... Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation and any test that uses the model will be linked to the logged model and its metadata." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Assign Predictions\n", - "\n", - "To precompute the embeddings for our test set, we can call the `assign_predictions` method of our `vm_test_ds` object we created above. 
This will compute the embeddings for each question in the test set and store them in the a special prediction column of the test set thats linked to our `vm_embedder` model. This will allow us to use these embeddings later when we run tests against our embedding model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(vm_embedder_small)\n", - "vm_test_ds.assign_predictions(vm_embedder_large)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Run tests\n", - "\n", - "Now that everything is setup for the embedding model, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - " params={\n", - " \"probability\": 0.3,\n", - " \"mean_similarity_threshold\": 0.7,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - " params={\n", - " \"probability\": 0.3,\n", - " \"mean_similarity_threshold\": 0.7,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - " params={\n", - " \"source_lang\": \"en\",\n", - " \"target_lang\": \"fr\",\n", - " \"mean_similarity_threshold\": 0.7,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - " params={\n", - " \"n_components\": 3,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup Vector Store\n", - "\n", - "Great, so now that we have assessed our embedding model and verified that it 
is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Generate embeddings for the Train Set\n", - "\n", - "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds.assign_predictions(vm_embedder_small)\n", - "print(vm_train_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Insert embeddings and questions into Vector DB\n", - "\n", - "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.vectorstores import Qdrant\n", - "from langchain_community.document_loaders import DataFrameLoader\n", - "\n", - "# load documents from dataframe\n", - "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", - "docs = loader.load()\n", - "\n", - "# setup vector datastore\n", - "qdrant = Qdrant.from_documents(\n", - " docs,\n", - " embedding_small_client,\n", - " location=\":memory:\", # Local mode with in-memory storage only\n", - " collection_name=\"rfp_rag_collection\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retrieval Model\n", - "\n", - "Now that we have an embedding model and a vector database setup and loaded with our data, we need a Retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize this as a ValidMind model and `assign_predictions` to it just like our embedding model. In this example, we'll create two retrieval models with different `k` parameters (the number of documents retrieved) to benchmark and compare their performance. This approach allows us to evaluate how retrieval depth affects the overall system quality." 
- ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "def retrieve(input):\n", - " contexts = []\n", - "\n", - " for result in qdrant.similarity_search_with_score(input[\"question\"], k=5):\n", - " document, score = result\n", - " context = f\"Q: {document.page_content}\\n\"\n", - " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", - "\n", - " contexts.append(context)\n", - "\n", - " return contexts\n", - "\n", - "\n", - "vm_retriever_k5 = vm.init_model(input_id=\"retrieval_k5_model\", predict_fn=retrieve)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "def retrieve(input):\n", - " contexts = []\n", - "\n", - " for result in qdrant.similarity_search_with_score(input[\"question\"], k=10):\n", - " document, score = result\n", - " context = f\"Q: {document.page_content}\\n\"\n", - " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", - "\n", - " contexts.append(context)\n", - "\n", - " return contexts\n", - "\n", - "\n", - "vm_retriever_k10 = vm.init_model(input_id=\"retrieval_k10_model\", predict_fn=retrieve)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(model=vm_retriever_k5)\n", - "vm_test_ds.assign_predictions(model=vm_retriever_k10)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generation Model\n", - "\n", - "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` and `gpt-4o` models from OpenAI. Since we have two retrieval models (with different `k` values) and want to test two different LLMs, we'll create a total of four generator models - pairing each retrieval configuration with each LLM to comprehensively evaluate how both retrieval depth and model capability affect response quality." 
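The four generator cells that follow are nearly identical, differing only in which retrieval column they read and which OpenAI model they call. As a compact alternative (a sketch only, not part of the original notebook; the helper name `make_generator` is our own), the same four models could be produced by a small factory once `openai_client`, `system_prompt`, and `Prompt` are defined as in the next cell:

```python
# Hypothetical helper: build one ValidMind generator per (retrieval column, LLM) pair.
# Assumes openai_client, system_prompt, and Prompt are defined as in the cells below.
def make_generator(input_id, retrieval_column, llm):
    def generate(input):
        response = openai_client.chat.completions.create(
            model=llm,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "\n\n".join(input[retrieval_column])},
                {"role": "user", "content": input["question"]},
            ],
        )
        return response.choices[0].message.content

    return vm.init_model(
        input_id=input_id,
        predict_fn=generate,
        prompt=Prompt(template=system_prompt),
    )

# For example:
# vm_generator_k5_gpt35 = make_generator("generation_k5_gpt35_model", "retrieval_k5_model", "gpt-3.5-turbo")
```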
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "\n", - "from validmind.models import Prompt\n", - "\n", - "\n", - "system_prompt = \"\"\"\n", - "You are an expert RFP AI assistant.\n", - "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", - "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", - "After that you will be provided with a new RFP question.\n", - "You will generate an answer and respond only with the answer.\n", - "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", - "\"\"\".strip()\n", - "\n", - "openai_client = OpenAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "def generate(input):\n", - " \n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - " \n", - " return response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator_k5_gpt35 = vm.init_model(\n", - " input_id=\"generation_k5_gpt35_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "def generate(input):\n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - "\n", - " return response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator_k10_gpt35 = vm.init_model(\n", - " input_id=\"generation_k10_gpt35_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "def generate(input):\n", - " \n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-4o\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - " \n", - " return response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator_k5_gpt4o = vm.init_model(\n", - " input_id=\"generation_k5_gpt4o_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate(input):\n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-4o\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - "\n", - " return 
response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator_k10_gpt4o = vm.init_model(\n", - " input_id=\"generation_k10_gpt4o_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's test it out real quick:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "vm_generator_k5_gpt35.predict(\n", - " pd.DataFrame(\n", - " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_generator_k5_gpt4o.predict(\n", - " pd.DataFrame(\n", - " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Prompt Evaluation\n", - "\n", - "Now that we have our generator model initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", - "\n", - "- **Examplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", - "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **Clarity**: How clearly the prompt states the task.\n", - "- **Conciseness**: How succinctly the prompt states the task.\n", - "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", - "- **Specificity**: How specific the prompt defines the task." 
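The cells below run each of these prompt-validation tests one at a time against `vm_generator_k5_gpt4o`. Equivalently (a sketch that simply loops over the same test IDs used in those cells), you could run them in a single cell:

```python
# Run all six prompt-validation tests against the same generator's system prompt.
# The test IDs match the individual cells that follow.
for test_name in [
    "Bias",
    "Clarity",
    "Conciseness",
    "Delimitation",
    "NegativeInstruction",
    "Specificity",
]:
    run_test(
        f"validmind.prompt_validation.{test_name}",
        inputs={"model": vm_generator_k5_gpt4o},
    ).log()
```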
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Bias\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Clarity\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Conciseness\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Delimitation\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.NegativeInstruction\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Specificity\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup RAG Pipeline Model\n", - "\n", - "Now that we have all of our individual \"component\" models setup and initialized we need some way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use a pipe(`|`) operator - in Python this is normally an `or` operator but we have overloaded it for easy pipeline creation - to chain together our models. We can then initialize this pipeline model and assign predictions to it just like any other model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_rag_k5_gpt35_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt35, input_id=\"rag_k5_gpt35_model\")\n", - "vm_rag_k10_gpt35_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt35, input_id=\"rag_k10_gpt35_model\")\n", - "vm_rag_k5_gpt4o_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt4o, input_id=\"rag_k5_gpt4o_model\")\n", - "vm_rag_k10_gpt4o_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt4o, input_id=\"rag_k10_gpt4o_model\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use." 
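Each pipeline's output is stored in a prediction column named after its `input_id` (for example `rag_k5_gpt35_model_prediction`), which is the naming the RAGAS tests below rely on. Assuming that convention, once the predictions in the next cell have been assigned you can eyeball the four pipelines side by side:

```python
# Compare the four RAG pipeline outputs against the ground truth (run after assign_predictions).
comparison_columns = [
    "question",
    "ground_truth",
    "rag_k5_gpt35_model_prediction",
    "rag_k10_gpt35_model_prediction",
    "rag_k5_gpt4o_model_prediction",
    "rag_k10_gpt4o_model_prediction",
]
vm_test_ds._df[comparison_columns].head()
```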
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt35_model)\n", - "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt35_model)\n", - "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt4o_model)\n", - "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt4o_model)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds._df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run tests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## RAGAS evaluation\n", - "\n", - "Let's go ahead and run some of our new RAG tests against our model...\n", - "\n", - "> Note: these tests are still being developed and are not yet in a stable state. We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area so stay tuned!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Semantic Similarity\n", - "\n", - "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", - "\n", - "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Entity Recall\n", - "\n", - "This test gives the measure of recall of the retrieved context, based on the number of entities present in both ground_truths and contexts relative to the number of entities present in the ground_truths alone. Simply put, it is a measure of what fraction of entities are recalled from ground_truths. This test is useful in fact-based use cases like tourism help desk, historical QA, etc. This test can help evaluate the retrieval mechanism for entities, based on comparison with entities present in ground_truths, because in cases where entities matter, we need the contexts which cover them." 
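As a rough intuition only (a toy calculation with made-up entities, not the actual RAGAS implementation), context entity recall is the fraction of ground-truth entities that also show up in the retrieved contexts:

```python
# Toy illustration of context entity recall (hypothetical entities).
reference_entities = {"SOC 2", "AES-256", "ISO 27001"}   # entities in the ground truth
context_entities = {"SOC 2", "AES-256", "TLS 1.2"}       # entities found in retrieved contexts

entity_recall = len(reference_entities & context_entities) / len(reference_entities)
print(entity_recall)  # 2 of 3 ground-truth entities recalled -> ~0.67
```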
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"reference_column\": [\"ground_truth\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Precision\n", - "\n", - "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This test is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecision\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Precision Without Reference\n", - "\n", - "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. This test assesses the relevance of each retrieved context chunk by comparing it directly to the response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid=[\n", - " {\"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_k5_model_prediction\",\n", - " \"response_column\": \"rag_k5_gpt4o_model_prediction\"\n", - " },\n", - " {\"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_k10_model_prediction\",\n", - " \"response_column\": \"rag_k10_gpt4o_model_prediction\"\n", - " },\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Faithfulness\n", - "\n", - "This measures the factual consistency of the generated answer against the given context. It is calculated from answer and retrieved context. The answer is scaled to (0,1) range. Higher the better.\n", - "\n", - "The generated answer is regarded as faithful if all the claims that are made in the answer can be inferred from the given context. 
To calculate this, a set of claims is first identified in the generated answer. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.Faithfulness\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Response Relevancy\n", - "\n", - "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores indicate better relevancy. This test is computed using the question, the context and the answer.\n", - "\n", - "The Response Relevancy is defined as the mean cosine similarity of the original question to a number of artificial questions, which were generated (reverse engineered) based on the answer.\n", - "\n", - "Please note that even though in practice the score will range between 0 and 1 most of the time, this is not mathematically guaranteed, because cosine similarity ranges from -1 to 1.\n", - "\n", - "> Note: This is a reference-free test. If you’re looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", - "\n", - "An answer is deemed relevant when it directly and appropriately addresses the original question. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Recall\n", - "\n", - "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", - "\n", - "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. 
In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextRecall\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Answer Correctness\n", - "\n", - "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", - "\n", - "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", - "\n", - "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", - "\n", - "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", - "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", - "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Aspect Critic\n", - "\n", - "This is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Additionally, users have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input.\n", - "\n", - "Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. Users can also define their own aspects for evaluating submissions based on their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AspectCritic\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Noise Sensitivity\n", - "\n", - "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Generation quality\n", - "\n", - "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Token Disparity\n", - "\n", - "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." 
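As a rough intuition (illustrative strings and naive whitespace tokenization only; the actual test uses its own tokenization and reporting), token disparity simply contrasts the length of each generated answer with its reference:

```python
# Toy illustration of token disparity between a reference answer and a generated answer.
reference = "Customer data is encrypted at rest with AES-256 and in transit with TLS 1.2 or higher."
generated = "Data is encrypted at rest and in transit."

print(len(reference.split()), "reference tokens vs", len(generated.split()), "generated tokens")
```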
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.TokenDisparity\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### ROUGE Score\n", - "\n", - "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", - "\n", - "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.RougeScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - " params={\n", - " \"metric\": \"rouge-1\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### BLEU Score\n", - "\n", - "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.BleuScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### BERT Score\n", - "\n", - "This test evaluates the quality of the RAG generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. 
This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.BertScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### METEOR Score\n", - "\n", - "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing on how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.MeteorScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Bias and Toxicity\n", - "\n", - "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth. These tests help us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality, enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity Score\n", - "\n", - "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ToxicityScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Regard Score\n", - "\n", - "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The test uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.RegardScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Conclusion\n", - "\n", - "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate and document a simple RAG Model as it's developed. This is a great example of the interactive development experience that ValidMind is designed to support. We can quickly iterate on our model and document as we go... We have seen how ValidMind supports non-traditional \"models\" using a functional interface and how we can build pipelines of many models to support complex GenAI workflows.\n", - "\n", - "This is still a work in progress and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-09e315440ca84258abe1aaefaca3a3d0", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ValidMind Library", - "language": "python", - "name": "validmind" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/site/notebooks/code_samples/nlp_and_llm/rag_documentation_demo.ipynb b/site/notebooks/code_samples/nlp_and_llm/rag_documentation_demo.ipynb deleted file mode 100644 index 485c81ffdb..0000000000 --- a/site/notebooks/code_samples/nlp_and_llm/rag_documentation_demo.ipynb +++ /dev/null @@ -1,1681 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# RAG Model Documentation Demo\n", - "\n", - "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Initialize the ValidMind Library](#toc2_1__) \n", - " - [Register sample model](#toc2_1_1__) \n", - " - [Apply documentation template](#toc2_1_2__) \n", - " - [Get your code snippet](#toc2_1_3__) \n", - "- [Read Open AI API Key](#toc3__) \n", - "- [Dataset Loader](#toc4__) \n", - "- [Data validation](#toc5__) \n", - " - [Duplicates](#toc5_1__) \n", - " - [Stop Words](#toc5_2__) \n", - " - [Punctuations](#toc5_3__) \n", - " - [Common Words](#toc5_4__) \n", - " - [Language Detection](#toc5_5__) \n", - " - [Toxicity Score](#toc5_6__) \n", - " - [Polarity and Subjectivity](#toc5_7__) \n", - " - [Sentiment](#toc5_8__) \n", - " - [Assign Predictions](#toc5_9__) \n", - " - [Run tests](#toc5_10__) \n", - " - [Generate embeddings for the Train Set](#toc5_11__) \n", - " - [Insert embeddings and questions into Vector DB](#toc5_12__) \n", - "- [Prompt Evaluation](#toc6__) \n", - "- [RAGAS evaluation](#toc7__) \n", - " - [Semantic Similarity](#toc7_1__) \n", - " - [Context Entity Recall](#toc7_2__) \n", - " - [Context Precision](#toc7_3__) \n", - " - [Context Precision Without Reference](#toc7_4__) \n", - " - [Faithfulness](#toc7_5__) \n", - " - [Response Relevancy](#toc7_6__) \n", - " - [Context Recall](#toc7_7__) \n", - " - [Answer Correctness](#toc7_8__) \n", - " - [Aspect Critic](#toc7_9__) \n", - " - [Noise Sensitivity](#toc7_10__) \n", - "- [Generation quality](#toc8__) \n", - " - [Token Disparity](#toc8_1__) \n", - " - [ROUGE Score](#toc8_2__) \n", - " - [BLEU Score](#toc8_3__) \n", - " - [BERT Score](#toc8_4__) \n", - " - [METEOR Score](#toc8_5__) \n", - "- [Bias and Toxicity](#toc9__) \n", - " - [Toxicity Score](#toc9_1__) \n", - " - [Regard Score](#toc9_2__) \n", - "- [Upgrade ValidMind](#toc10__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - 
"ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", - "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", - "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prerequisites\n", - "\n", - "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q qdrant-client langchain langchain-openai sentencepiece" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Analytics`\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Gen AI RAG Template`.\n", - "\n", - "3. Click **Use Template** to apply the template." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Read Open AI API Key\n", - "\n", - "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` model for our embeddings, `gpt-3.5-turbo` model for our generator and `gpt-4o` model for our LLM-as-Judge tests. If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load openai api key\n", - "import os\n", - "\n", - "import dotenv\n", - "import nltk\n", - "\n", - "dotenv.load_dotenv()\n", - "nltk.download('stopwords')\n", - "nltk.download('punkt_tab')\n", - "\n", - "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", - "\n", - "if not \"OPENAI_API_KEY\" in os.environ:\n", - " raise ValueError(\"OPENAI_API_KEY is not set\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Dataset Loader\n", - "\n", - "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Import the sample dataset from the library\n", - "from validmind.datasets.llm.rag import rfp\n", - "\n", - "raw_df = rfp.load_data()\n", - "train_df, test_df = rfp.preprocess(raw_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds = vm.init_dataset(\n", - " train_df,\n", - " text_column=\"question\",\n", - " target_column=\"ground_truth\",\n", - ")\n", - "\n", - "vm_test_ds = vm.init_dataset(\n", - " test_df,\n", - " text_column=\"question\",\n", - " target_column=\"ground_truth\",\n", - ")\n", - "\n", - "vm_test_ds.df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Data validation\n", - "\n", - "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in array of text data quality tests to check that things like duplicates, missing values, and other common text data issues are not present in our dataset. We can also run some tests to check the sentiment and toxicity of our data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Duplicates\n", - "\n", - "First, let's check for duplicates in our dataset. We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "\n", - "run_test(\n", - " test_id=\"validmind.data_validation.Duplicates\",\n", - " inputs={\"dataset\": vm_train_ds},\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Stop Words\n", - "\n", - "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.nlp.StopWords` test and pass our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.StopWords\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Punctuations\n", - "\n", - "Next, let's check for punctuation in our dataset. We can use the `validmind.data_validation.nlp.Punctuations` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.Punctuations\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Common Words\n", - "\n", - "Next, let's check for common words in our dataset. 
We can use the `validmind.data_validation.nlp.CommonWords` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.CommonWords\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Language Detection\n", - "\n", - "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.nlp.LanguageDetection` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity Score\n", - "\n", - "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Toxicity\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Polarity and Subjectivity\n", - "\n", - "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Sentiment\n", - "\n", - "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Sentiment\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Embedding Model\n", - "\n", - "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding model. We will use the `text-embedding-3-small` model from OpenAI for this purpose, wrapped in the `OpenAIEmbeddings` class from LangChain. This model will be used to \"embed\" our questions both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the question from inputs when making predictions with our RAG model."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import OpenAIEmbeddings\n", - "\n", - "embedding_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", - "\n", - "\n", - "def embed(input):\n", - " \"\"\"Returns a text embedding for the given text\"\"\"\n", - " return embedding_client.embed_query(input[\"question\"])\n", - "\n", - "\n", - "vm_embedder = vm.init_model(input_id=\"embedding_model\", predict_fn=embed)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What we have done here is to initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` model. We then created an `embed` function that takes in an `input` dictionary and uses the `embed_query` method of the embedding client to compute the embeddings of the `question`. We use an `embed` function since that is how ValidMind supports any custom model. We will use this strategy for the retrieval and generator models as well, but you could also use, say, a HuggingFace model directly. See the documentation for more information on which model types are directly supported - [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html)... Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation and any test that uses the model will be linked to the logged model and its metadata." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Assign Predictions\n", - "\n", - "To precompute the embeddings for our test set, we can call the `assign_predictions` method of our `vm_test_ds` object we created above. This will compute the embeddings for each question in the test set and store them in a special prediction column of the test set that's linked to our `vm_embedder` model. This will allow us to use these embeddings later when we run tests against our embedding model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(vm_embedder)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Run tests\n", - "\n", - "Now that everything is set up for the embedding model, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our model."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\"probability\": 0.3},\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\"probability\": 0.3},\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\n", - " \"source_lang\": \"en\",\n", - " \"target_lang\": \"fr\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "run_test(\n", - " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.EuclideanDistanceHeatmap\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\"n_components\": 3},\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.TSNEComponentsPairwisePlots\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\"n_components\": 3, \"perplexity\": 20},\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup Vector Store\n", - "\n", - "Great, so now that we have assessed our embedding model and verified that it is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Generate embeddings for the Train Set\n", - "\n", - "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds.assign_predictions(vm_embedder)\n", - "print(vm_train_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Insert embeddings and questions into Vector DB\n", - "\n", - "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.vectorstores import Qdrant\n", - "from langchain_openai import OpenAIEmbeddings\n", - "from langchain_community.document_loaders import DataFrameLoader\n", - "\n", - "# load documents from dataframe\n", - "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", - "docs = loader.load()\n", - "# choose model using embedding client\n", - "embedding_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", - "\n", - "# setup vector datastore\n", - "qdrant = Qdrant.from_documents(\n", - " docs,\n", - " embedding_client,\n", - " location=\":memory:\", # Local mode with in-memory storage only\n", - " collection_name=\"rfp_rag_collection\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retrieval Model\n", - "\n", - "Now that we have an embedding model and a vector database setup and loaded with our data, we need a Retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize this as a ValidMind model and `assign_predictions` to it just like our embedding model." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "def retrieve(input):\n", - " contexts = []\n", - "\n", - " for result in qdrant.similarity_search_with_score(input[\"question\"]):\n", - " document, score = result\n", - " context = f\"Q: {document.page_content}\\n\"\n", - " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", - "\n", - " contexts.append(context)\n", - "\n", - " return contexts\n", - "\n", - "\n", - "vm_retriever = vm.init_model(input_id=\"retrieval_model\", predict_fn=retrieve)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(model=vm_retriever)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generation Model\n", - "\n", - "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` model from OpenAI." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "\n", - "from validmind.models import Prompt\n", - "\n", - "\n", - "system_prompt = \"\"\"\n", - "You are an expert RFP AI assistant.\n", - "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", - "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", - "After that you will be provided with a new RFP question.\n", - "You will generate an answer and respond only with the answer.\n", - "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", - "\"\"\".strip()\n", - "\n", - "openai_client = OpenAI()\n", - "\n", - "\n", - "def generate(input):\n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - "\n", - " return response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator = vm.init_model(\n", - " input_id=\"generation_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's test it out real quick:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "vm_generator.predict(\n", - " pd.DataFrame(\n", - " {\"retrieval_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Prompt Evaluation\n", - "\n", - "Now that we have our generator model initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", - "\n", - "- **Exemplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", - "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **Clarity**: How clearly the prompt states the task.\n", - "- **Conciseness**: How succinctly the prompt states the task.\n", - "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", - "- **Specificity**: How specifically the prompt defines the task."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Bias\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Clarity\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Conciseness\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Delimitation\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.NegativeInstruction\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Specificity\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup RAG Pipeline Model\n", - "\n", - "Now that we have all of our individual \"component\" models setup and initialized we need some way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use a pipe(`|`) operator - in Python this is normally an `or` operator but we have overloaded it for easy pipeline creation - to chain together our models. We can then initialize this pipeline model and assign predictions to it just like any other model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_rag_model = vm.init_model(vm_retriever | vm_generator, input_id=\"rag_model\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(model=vm_rag_model)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds._df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run tests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## RAGAS evaluation\n", - "\n", - "Let's go ahead and run some of our new RAG tests against our model...\n", - "\n", - "> Note: these tests are still being developed and are not yet in a stable state. 
We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area so stay tuned!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Semantic Similarity\n", - "\n", - "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", - "\n", - "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Entity Recall\n", - "\n", - "This test gives the measure of recall of the retrieved context, based on the number of entities present in both ground_truths and contexts relative to the number of entities present in the ground_truths alone. Simply put, it is a measure of what fraction of entities are recalled from ground_truths. This test is useful in fact-based use cases like tourism help desk, historical QA, etc. This test can help evaluate the retrieval mechanism for entities, based on comparison with entities present in ground_truths, because in cases where entities matter, we need the contexts which cover them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"reference_column\": \"ground_truth\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Precision\n", - "\n", - "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This test is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecision\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Precision Without Reference\n", - "\n", - "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. This test assesses the relevance of each retrieved context chunk by comparing it directly to the response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Faithfulness\n", - "\n", - "This test measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context, and the score is scaled to the (0,1) range, where higher is better.\n", - "\n", - "The generated answer is regarded as faithful if all the claims that are made in the answer can be inferred from the given context. To calculate this, a set of claims from the generated answer is first identified. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.Faithfulness\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Response Relevancy\n", - "\n", - "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, while higher scores indicate better relevancy. This test is computed using the question, the context and the answer.\n", - "\n", - "The Response Relevancy is defined as the mean cosine similarity of the original question to a number of artificial questions, which were generated (reverse engineered) based on the answer.\n", - "\n", - "Please note that even though in practice the score will range between 0 and 1 most of the time, this is not mathematically guaranteed, since cosine similarity ranges from -1 to 1.\n", - "\n", - "> Note: This is a reference-free test. If you’re looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", - "\n", - "An answer is deemed relevant when it directly and appropriately addresses the original question. 
Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Recall\n", - "\n", - "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", - "\n", - "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextRecall\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Answer Correctness\n", - "\n", - "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", - "\n", - "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", - "\n", - "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", - "\n", - "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", - "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", - "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Aspect Critic\n", - "\n", - "This is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Additionally, users have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input.\n", - "\n", - "Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. Users can also define their own aspects for evaluating submissions based on their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AspectCritic\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Noise Sensitivity\n", - "\n", - "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Generation quality\n", - "\n", - "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. 
We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Token Disparity\n", - "\n", - "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.TokenDisparity\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### ROUGE Score\n", - "\n", - "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", - "\n", - "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.RougeScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - " params={\n", - " \"metric\": \"rouge-1\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### BLEU Score\n", - "\n", - "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. 
While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.BleuScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### BERT Score\n", - "\n", - "This test evaluates the quality of the RAG generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.BertScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### METEOR Score\n", - "\n", - "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing on how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.MeteorScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Bias and Toxicity\n", - "\n", - "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth. 
These tests help us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality, enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity Score\n", - "\n", - "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ToxicityScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Regard Score\n", - "\n", - "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The test uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.RegardScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Conclusion\n", - "\n", - "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate and document a simple RAG Model as it's developed. This is a great example of the interactive development experience that ValidMind is designed to support. We can quickly iterate on our model and document as we go... We have seen how ValidMind supports non-traditional \"models\" using a functional interface and how we can build pipelines of many models to support complex GenAI workflows.\n", - "\n", - "This is still a work in progress and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-397fa35a68a34dc38f5d84d797fb5331", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "validmind-py3.10", - "language": "python", - "name": "validmind-py3.10" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/site/notebooks/how_to/dataset_image.png b/site/notebooks/how_to/data_and_datasets/dataset_image.png similarity index 100% rename from site/notebooks/how_to/dataset_image.png rename to site/notebooks/how_to/data_and_datasets/dataset_image.png diff --git a/site/notebooks/how_to/configure_dataset_features.ipynb b/site/notebooks/how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb similarity index 98% rename from site/notebooks/how_to/configure_dataset_features.ipynb rename to site/notebooks/how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb index 30d86a1ee0..86081ab36e 100644 --- a/site/notebooks/how_to/configure_dataset_features.ipynb +++ b/site/notebooks/how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb @@ -93,7 +93,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -158,8 +158,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/how_to/load_datasets_predictions.ipynb b/site/notebooks/how_to/data_and_datasets/dataset_inputs/load_datasets_predictions.ipynb similarity index 99% rename from site/notebooks/how_to/load_datasets_predictions.ipynb rename to site/notebooks/how_to/data_and_datasets/dataset_inputs/load_datasets_predictions.ipynb index a2dacdeea0..51f0dff6ef 100644 --- a/site/notebooks/how_to/load_datasets_predictions.ipynb +++ b/site/notebooks/how_to/data_and_datasets/dataset_inputs/load_datasets_predictions.ipynb @@ -113,7 +113,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -178,8 +178,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/how_to/model_image.png b/site/notebooks/how_to/data_and_datasets/model_image.png similarity index 100% rename from site/notebooks/how_to/model_image.png rename to site/notebooks/how_to/data_and_datasets/model_image.png diff --git a/site/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb b/site/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb new file mode 100644 index 0000000000..7ea025faa9 --- /dev/null +++ b/site/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb @@ -0,0 +1,1003 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to ValidMind Dataset and Model Objects\n", + "\n", + "When writing custom tests, it is essential to be aware of the interfaces of the ValidMind Dataset and ValidMind Model, which are used as input arguments.\n", + "\n", + "As a model developer, writing custom tests is beneficial when the ValidMind library lacks a built-in test for your specific needs. 
For example, a model might require new tests to evaluate specific aspects of the model or dataset based on a particular use case.\n", + "\n", + "This interactive notebook offers a detailed understanding of ValidMind objects and their use in writing custom tests. It introduces various interfaces provided by these objects and demonstrates how they can be leveraged to implement tests effortlessly." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Install the ValidMind Library](#toc2_1__) \n", + " - [Initialize the ValidMind Library](#toc2_2__) \n", + " - [Register sample model](#toc2_2_1__) \n", + " - [Apply documentation template](#toc2_2_2__) \n", + " - [Get your code snippet](#toc2_2_3__) \n", + "- [Load the demo dataset](#toc3__) \n", + " - [Prepocess the raw dataset](#toc3_1__) \n", + "- [Train a model for testing](#toc4__) \n", + "- [Explore basic components of the ValidMind library](#toc5__) \n", + " - [VMDataset Object](#toc5_1__) \n", + " - [Initialize the ValidMind datasets](#toc5_1_1__) \n", + " - [ Interfaces of the dataset object](#toc5_1_2__) \n", + " - [Using VM Dataset object as arguments in custom tests](#toc5_2__) \n", + " - [Run the test](#toc5_2_1__) \n", + " - [Using VM Dataset object and parameters as arguments in custom tests](#toc5_3__) \n", + " - [VMModel Object](#toc5_4__) \n", + " - [Initialize ValidMind model object](#toc5_5__) \n", + " - [Assign predictions to the datasets](#toc5_6__) \n", + " - [Using VM Model and Dataset objects as arguments in Custom tests](#toc5_7__) \n", + " - [Log the test results](#toc5_8__) \n", + "- [Where to go from here](#toc6__) \n", + " - [Use cases](#toc6_1__) \n", + " - [More how-to guides and code samples](#toc6_2__) \n", + " - [Discover more learning resources](#toc6_3__) \n", + "- [Upgrade ValidMind](#toc7__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. 
There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "Here, we will focus on ValidMind dataset, ValidMind model and tests to use these objects to generate artefacts for the documentation.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + "- **model**: A single ValidMind model object that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + "- **dataset**: Single ValidMind dataset object that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Dataset based Test**\n", + "\n", + "![Dataset based test architecture](./dataset_image.png)\n", + "The dataset based tests take VM dataset object(s) as inputs, test configuration as test parameters to produce `Outputs` as mentioned above.\n", + "\n", + "**Model based Test**\n", + "\n", + "![Model based test architecture](./model_image.png)\n", + "Similar to datasest based tests, the model based tests as an additional input that is VM model object. It allows to identify prediction values of a specific model in the dataset object. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "Please note the following recommended Python versions to use:\n", + "\n", + "- Python 3.7 > x <= 3.11\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. 
In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Binary classification`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import xgboost as xgb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Load the demo dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.datasets.classification import customer_churn as demo_dataset\n", + "\n", + "raw_df = demo_dataset.load_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Prepocess the raw dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df, validation_df, test_df = demo_dataset.preprocess(raw_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Train a model for testing\n", + "\n", + "We train a simple customer churn model for our test." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_train = train_df.drop(demo_dataset.target_column, axis=1)\n", + "y_train = train_df[demo_dataset.target_column]\n", + "x_val = validation_df.drop(demo_dataset.target_column, axis=1)\n", + "y_val = validation_df[demo_dataset.target_column]\n", + "\n", + "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", + "model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "model.fit(\n", + " x_train,\n", + " y_train,\n", + " eval_set=[(x_val, y_val)],\n", + " verbose=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Explore basic components of the ValidMind library\n", + "\n", + "In this section, you will learn about the basic objects of the ValidMind library that are necessary to implement both custom and built-in tests. As explained above, these objects are:\n", + "* VMDataset: [The high level APIs can be found here](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset)\n", + "* VMModel: [The high level APIs can be found here](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMModel)\n", + "\n", + "Let's understand these objects and their interfaces step by step: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### VMDataset Object" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Initialize the ValidMind datasets\n", + "\n", + "You can initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module.\n", + "\n", + "The function wraps the dataset to create a ValidMind `Dataset` object so that you can write tests effectively using the common interface provided by the VM objects. This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind. You only need to do it one time per dataset.\n", + "\n", + "This function takes a number of arguments. Some of the arguments are:\n", + "\n", + "- `dataset` — the raw dataset that you want to provide as input to tests\n", + "- `input_id` - a unique identifier that allows tracking what inputs are used when running each individual test\n", + "- `target_column` — a required argument if tests require access to true values. This is the name of the target column in the dataset\n", + "\n", + "The detailed list of the arguments can be found [here](https://docs.validmind.ai/validmind/validmind.html#init_dataset) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# vm_raw_dataset is now a VMDataset object that you can pass to any ValidMind test\n", + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=raw_df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have a ValidMind dataset object (VMDataset), you can inspect its attributes and methods using the inspect_obj utility module. This module provides a list of available attributes and interfaces for use in tests. Understanding how to use VMDatasets is crucial for comprehending how a custom test functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.utils import inspect_obj\n", + "inspect_obj(vm_raw_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Interfaces of the dataset object\n", + "\n", + "**DataFrame**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset.df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Feature columns**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset.feature_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Target column**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset.target_column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Features values**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset.x_df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Target value**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset.y_df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Numeric feature columns** " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset.feature_columns_numeric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Categorical feature columns** " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset.feature_columns_categorical" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similarly, you can use all other interfaces of the [VMDataset objects](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Using VM Dataset object as arguments in custom tests\n", + "\n", + "A custom test is simply a Python function that takes two types of arguments: `inputs` and `params`. The `inputs` are ValidMind objects (`VMDataset`, `VMModel`), and the `params` are additional parameters required for the underlying computation of the test. We will discuss both types of arguments in the following sections.\n", + "\n", + "Let's start with a custom test that requires only a ValidMind dataset object. In this example, we will check the balance of classes in the target column of the dataset:\n", + "\n", + "- The custom test below requires a single argument of type `VMDataset` (dataset).\n", + "- The `my_custom_tests.ClassImbalance` is a unique test identifier that can be assigned using the `vm.test` decorator functionality. 
This unique test ID will be used in the platform to load test results in the documentation.\n", + "- The `dataset.target_column` and `dataset.df` attributes of the `VMDataset` object are used in the test.\n", + "\n", + "Other high-level APIs (attributes and methods) of the dataset object are listed [here](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset).\n", + "\n", + "If you've gone through the [Implement custom tests notebook](../tests/custom_tests/implement_custom_tests.ipynb), you should have a good understanding of how custom tests are implemented in detail. If you haven't, we recommend going through that notebook first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.vm_models.dataset.dataset import VMDataset\n", + "import pandas as pd\n", + "\n", + "@vm.test(\"my_custom_tests.ClassImbalance\")\n", + "def class_imbalance(dataset):\n", + " # Can only run this test if we have a Dataset object\n", + " if not isinstance(dataset, VMDataset):\n", + " raise ValueError(\"ClassImbalance requires a validmind Dataset object\")\n", + "\n", + " if dataset.target_column is None:\n", + " print(\"Skipping class_imbalance test because no target column is defined\")\n", + " return\n", + "\n", + " # VMDataset object provides target_column attribute\n", + " target_column = dataset.target_column\n", + " # we can access pandas DataFrame using df attribute\n", + " imbalance_percentages = dataset.df[target_column].value_counts(\n", + " normalize=True\n", + " )\n", + " classes = list(imbalance_percentages.index) \n", + " percentages = list(imbalance_percentages.values * 100)\n", + "\n", + " return pd.DataFrame({\"Classes\":classes, \"Percentage\": percentages})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Run the test\n", + "\n", + "Let's run the test using the `run_test` method, which is part of the `validmind.tests` module. Here, we pass the `dataset` through the `inputs`. Similarly, you can pass `datasets`, `model`, or `models` as inputs if your custom test requires them. In the example below, we run the custom test `my_custom_tests.ClassImbalance` by passing the `dataset` through the `inputs`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "result = run_test(\n", + " test_id=\"my_custom_tests.ClassImbalance\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can move custom tests into separate modules in a folder. This allows you to take one-off tests and move them into an organized structure that makes it easier to manage, maintain, and share them. We have provided a separate notebook with a detailed explanation [here](../tests/custom_tests/integrate_external_test_providers.ipynb) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Using VM Dataset object and parameters as arguments in custom tests\n", + "\n", + "Similar to `inputs`, you can pass `params` to a custom test by providing a dictionary of parameters to the `run_test()` function. The parameters will override any default parameters set in the custom test definition. Note that the `dataset` is still passed as `inputs`. \n", + "Let's modify the class imbalance test so that it provides flexibility to `normalize` the results."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.vm_models.dataset.dataset import VMDataset\n", + "import pandas as pd\n", + "\n", + "@vm.test(\"my_custom_tests.ClassImbalance\")\n", + "def class_imbalance(dataset, normalize=True):\n", + " # Can only run this test if we have a Dataset object\n", + " if not isinstance(dataset, VMDataset):\n", + " raise ValueError(\"ClassImbalance requires a validmind Dataset object\")\n", + "\n", + " if dataset.target_column is None:\n", + " print(\"Skipping class_imbalance test because no target column is defined\")\n", + " return\n", + "\n", + " # VMDataset object provides target_column attribute\n", + " target_column = dataset.target_column\n", + " # we can access pandas DataFrame using df attribute\n", + " imbalance_percentages = dataset.df[target_column].value_counts(\n", + " normalize=normalize\n", + " )\n", + " classes = list(imbalance_percentages.index) \n", + " if normalize: \n", + " result = pd.DataFrame({\"Classes\":classes, \"Percentage\": list(imbalance_percentages.values*100)})\n", + " else:\n", + " result = pd.DataFrame({\"Classes\":classes, \"Count\": list(imbalance_percentages.values)})\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the example below, the `normalize` parameter is set to `True`, so the class balance is reported as percentages. You can change the value to `False` if you want raw counts instead. The results of the test will reflect this flexibility, allowing for different outputs based on the parameter passed.\n", + "\n", + "Here, we have passed the `dataset` through the `inputs` and the `normalize` parameter using the `params`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "result = run_test(\n", + " test_id = \"my_custom_tests.ClassImbalance\",\n", + " inputs={\"dataset\": vm_raw_dataset},\n", + " params={\"normalize\": True},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### VMModel Object" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize ValidMind model object\n", + "\n", + "Similar to the ValidMind `Dataset` object, you can initialize a ValidMind Model object using the [`init_model`](https://docs.validmind.ai/validmind/validmind.html#init_model) function from the ValidMind (`vm`) module.\n", + "\n", + "This function takes a number of arguments. 
Some of the arguments are:\n", + "\n", + "- `model` — the raw model that you want to evaluate\n", + "- `input_id` - a unique identifier that allows tracking what inputs are used when running each individual test\n", + "\n", + "The detailed list of the arguments can be found [here](https://docs.validmind.ai/validmind/validmind.html#init_model) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "vm_model = vm.init_model(\n", + " model=model,\n", + " input_id=\"xgb_model\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect the methods and attributes of the model now:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inspect_obj(vm_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign predictions to the datasets\n", + "\n", + "We can now use the `assign_predictions()` method from the `Dataset` object to link existing predictions to any model. If no prediction values are passed, the method will compute predictions automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds = vm.init_dataset(\n", + " input_id=\"train_dataset\",\n", + " dataset=train_df,\n", + " type=\"generic\",\n", + " target_column=demo_dataset.target_column,\n", + ")\n", + "\n", + "vm_train_ds.assign_predictions(model=vm_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see below that the extra prediction column (`xgb_model_prediction`) for the model (`xgb_model`) has been added to the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(vm_train_ds)" + ] + },
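+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check, you can pull the stored predictions for a specific model straight back out of the dataset object. The following is a minimal sketch using the dataset's `y_pred()` method (covered in more detail in the next section):\n",
+    "\n",
+    "```python\n",
+    "# Retrieve the prediction values that assign_predictions() linked to this model\n",
+    "preds = vm_train_ds.y_pred(model=vm_model)\n",
+    "print(preds[:5])\n",
+    "```\n",
+    "\n",
+    "If you already have precomputed predictions, `assign_predictions()` is also designed to accept them directly instead of computing new ones; the exact argument names depend on your library version, so check the [`assign_predictions` API reference](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset)."
+   ]
+  },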
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Using VM Model and Dataset objects as arguments in Custom tests\n", + "\n", + "We will now create a `@vm.test` wrapper that will allow you to create a reusable test. Note the following changes in the code below:\n", + "\n", + "- The function `confusion_matrix` takes two arguments `dataset` and `model`. These are a `VMDataset` and a `VMModel` object, respectively.\n", + " - `VMDataset` objects allow you to access the dataset's true (target) values by accessing the `.y` attribute.\n", + " - `VMDataset` objects allow you to access the predictions for a given model by accessing the `.y_pred()` method.\n", + "- The function docstring provides a description of what the test does. This will be displayed along with the result in this notebook as well as in the ValidMind Platform.\n", + "- The function body calculates the confusion matrix using the `sklearn.metrics.confusion_matrix` function.\n", + "- The function then returns the `ConfusionMatrixDisplay.figure_` object - this is important as the ValidMind Library expects the output of the custom test to be a plot or a table.\n", + "- The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ConfusionMatrix` (see the section below on how test IDs work in ValidMind and why this format is important)\n", + "\n", + "Similarly, you can use the functionality provided by `VMDataset` and `VMModel` objects. You can refer to our documentation page for all the available APIs [here](https://docs.validmind.ai/validmind/validmind.html#init_dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import metrics\n", + "import matplotlib.pyplot as plt\n", + "@vm.test(\"my_custom_tests.ConfusionMatrix\")\n", + "def confusion_matrix(dataset, model):\n", + " \"\"\"The confusion matrix is a table that is often used to describe the performance of a classification model on a set of data for which the true values are known.\n", + "\n", + " The confusion matrix is a 2x2 table that contains 4 values:\n", + "\n", + " - True Positive (TP): the number of correct positive predictions\n", + " - True Negative (TN): the number of correct negative predictions\n", + " - False Positive (FP): the number of incorrect positive predictions\n", + " - False Negative (FN): the number of incorrect negative predictions\n", + "\n", + " The confusion matrix can be used to assess the holistic performance of a classification model by showing the accuracy, precision, recall, and F1 score of the model on a single figure.\n", + " \"\"\"\n", + " # we can retrieve the target values from the dataset using the y attribute\n", + " y_true = dataset.y\n", + " # and the prediction values of a specific model using the y_pred method \n", + " y_pred = dataset.y_pred(model=model)\n", + "\n", + " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", + "\n", + " cm_display = metrics.ConfusionMatrixDisplay(\n", + " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", + " )\n", + " cm_display.plot()\n", + " plt.close()\n", + "\n", + " return cm_display.figure_ # return the figure object itself" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we run the test using two inputs: `dataset` and `model`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "result = run_test(\n", + " test_id = \"my_custom_tests.ConfusionMatrix\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " \"model\": vm_model,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Log the test results\n", + "\n", + "You can log any test result to the ValidMind Platform with the `.log()` method of the result object. This will allow you to add the result to the documentation.\n", + "\n", + "Let's log the confusion matrix result:"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Where to go from here\n", + "\n", + "In this notebook you have learned the end-to-end process to document a model with the ValidMind Library, running through some very common scenarios in a typical model development setting:\n", + "\n", + "- Running out-of-the-box tests\n", + "- Documenting your model by adding evidence to model documentation\n", + "- Extending the capabilities of the ValidMind Library by implementing custom tests\n", + "- Ensuring that the documentation is complete by running all tests in the documentation template\n", + "\n", + "As a next step, you can explore the following notebooks to get a deeper understanding on how the ValidMind Library allows you generate model documentation for any use case:\n", + "\n", + "\n", + "\n", + "### Use cases\n", + "\n", + "- [Document an application scorecard model](../../use_cases/credit_risk/application_scorecard_full_suite.ipynb)\n", + "- [Linear regression documentation demo](../../use_cases/regression/quickstart_regression_full_suite.ipynb)\n", + "- [LLM model documentation demo](../../use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb)\n", + "\n", + "\n", + "\n", + "### More how-to guides and code samples\n", + "\n", + "- [Explore available tests in detail](../tests/explore_tests/explore_tests.ipynb)\n", + "- [In-depth guide for implementing custom tests](../tests/custom_tests/implement_custom_tests.ipynb)\n", + "- [In-depth guide to external test providers](../tests/custom_tests/integrate_external_test_providers.ipynb)\n", + "- [Configuring dataset features](./dataset_inputs/configure_dataset_features.ipynb)\n", + "- [Introduction to unit and composite tests](../metrics/run_unit_metrics.ipynb)\n", + "\n", + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", + "\n", + "- [Use cases](https://github.com/validmind/validmind-library/tree/main/notebooks/use_cases)\n", + "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-340a990e20194848af0efb0c965e219a", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/images/add_metric_over_time_block.png b/site/notebooks/how_to/metrics/add_metric_over_time_block.png similarity index 100% rename from site/notebooks/images/add_metric_over_time_block.png rename to site/notebooks/how_to/metrics/add_metric_over_time_block.png diff --git a/site/notebooks/images/log_metric_accuracy.png b/site/notebooks/how_to/metrics/log_metric_accuracy.png similarity index 100% rename from site/notebooks/images/log_metric_accuracy.png rename to site/notebooks/how_to/metrics/log_metric_accuracy.png diff --git a/site/notebooks/images/log_metric_attention.png b/site/notebooks/how_to/metrics/log_metric_attention.png similarity index 100% rename from site/notebooks/images/log_metric_attention.png rename to site/notebooks/how_to/metrics/log_metric_attention.png diff --git a/site/notebooks/images/log_metric_auc_1.png b/site/notebooks/how_to/metrics/log_metric_auc_1.png similarity index 100% rename from site/notebooks/images/log_metric_auc_1.png rename to site/notebooks/how_to/metrics/log_metric_auc_1.png diff --git a/site/notebooks/images/log_metric_auc_2.png b/site/notebooks/how_to/metrics/log_metric_auc_2.png similarity index 100% rename from site/notebooks/images/log_metric_auc_2.png rename to site/notebooks/how_to/metrics/log_metric_auc_2.png diff --git a/site/notebooks/images/log_metric_auc_3.png b/site/notebooks/how_to/metrics/log_metric_auc_3.png similarity index 100% rename from site/notebooks/images/log_metric_auc_3.png rename to site/notebooks/how_to/metrics/log_metric_auc_3.png diff --git a/site/notebooks/images/log_metric_auc_4.png b/site/notebooks/how_to/metrics/log_metric_auc_4.png similarity index 100% rename from site/notebooks/images/log_metric_auc_4.png rename to site/notebooks/how_to/metrics/log_metric_auc_4.png diff --git a/site/notebooks/images/log_metric_f1.png b/site/notebooks/how_to/metrics/log_metric_f1.png similarity index 100% rename from site/notebooks/images/log_metric_f1.png rename to site/notebooks/how_to/metrics/log_metric_f1.png diff --git a/site/notebooks/images/log_metric_precision.png b/site/notebooks/how_to/metrics/log_metric_precision.png similarity index 100% rename from site/notebooks/images/log_metric_precision.png rename to site/notebooks/how_to/metrics/log_metric_precision.png diff --git a/site/notebooks/images/log_metric_recall.png b/site/notebooks/how_to/metrics/log_metric_recall.png similarity index 100% rename from site/notebooks/images/log_metric_recall.png rename to site/notebooks/how_to/metrics/log_metric_recall.png diff --git a/site/notebooks/images/log_metric_satisfactory.png b/site/notebooks/how_to/metrics/log_metric_satisfactory.png similarity index 100% rename from site/notebooks/images/log_metric_satisfactory.png rename to site/notebooks/how_to/metrics/log_metric_satisfactory.png diff --git a/site/notebooks/images/log_metric_satisfactory_2.png b/site/notebooks/how_to/metrics/log_metric_satisfactory_2.png similarity index 100% rename from site/notebooks/images/log_metric_satisfactory_2.png rename to site/notebooks/how_to/metrics/log_metric_satisfactory_2.png diff --git 
a/site/notebooks/how_to/log_metrics_over_time.ipynb b/site/notebooks/how_to/metrics/log_metrics_over_time.ipynb similarity index 97% rename from site/notebooks/how_to/log_metrics_over_time.ipynb rename to site/notebooks/how_to/metrics/log_metrics_over_time.ipynb index f3bbfbdfa7..4b5c66dc2a 100644 --- a/site/notebooks/how_to/log_metrics_over_time.ipynb +++ b/site/notebooks/how_to/metrics/log_metrics_over_time.ipynb @@ -108,7 +108,7 @@ "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -173,8 +173,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Credit Risk - CECL`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
@@ -547,8 +545,8 @@ "- In this example, since we've only logged a single data point, the visualization shows just one measurement.\n", "- As you continue logging metrics, the graph will populate with more points, enabling you to track trends and patterns.\n", "\n", - "![Metric Over Time block](../images/add_metric_over_time_block.png)\n", - "![AUC Score](../images/log_metric_auc_1.png)" + "![Metric Over Time block](./add_metric_over_time_block.png)\n", + "![AUC Score](./log_metric_auc_1.png)" ] }, { @@ -591,7 +589,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![AUC Score](../images/log_metric_auc_2.png)" + "![AUC Score](./log_metric_auc_2.png)" ] }, { @@ -616,7 +614,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![AUC Score](../images/log_metric_auc_3.png)" + "![AUC Score](./log_metric_auc_3.png)" ] }, { @@ -719,11 +717,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![AUC Score](../images/log_metric_auc_4.png)\n", - "![Accuracy Score](../images/log_metric_accuracy.png)\n", - "![Precision Score](../images/log_metric_precision.png)\n", - "![Recall Score](../images/log_metric_recall.png)\n", - "![F1 Score](../images/log_metric_f1.png)" + "![AUC Score](./log_metric_auc_4.png)\n", + "![Accuracy Score](./log_metric_accuracy.png)\n", + "![Precision Score](./log_metric_precision.png)\n", + "![Recall Score](./log_metric_recall.png)\n", + "![F1 Score](./log_metric_f1.png)" ] }, { @@ -769,7 +767,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![GINI Score](../images/log_metric_satisfactory.png)" + "![GINI Score](./log_metric_satisfactory.png)" ] }, { @@ -802,7 +800,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![GINI Score](../images/log_metric_attention.png)" + "![GINI Score](./log_metric_attention.png)" ] }, { @@ -842,7 +840,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![GINI Score](../images/log_metric_satisfactory_2.png)" + "![GINI Score](./log_metric_satisfactory_2.png)" ] }, { diff --git a/site/notebooks/how_to/run_unit_metrics.ipynb b/site/notebooks/how_to/metrics/run_unit_metrics.ipynb similarity index 99% rename from site/notebooks/how_to/run_unit_metrics.ipynb rename to site/notebooks/how_to/metrics/run_unit_metrics.ipynb index 235c538a98..d9ffe11759 100644 --- a/site/notebooks/how_to/run_unit_metrics.ipynb +++ b/site/notebooks/how_to/metrics/run_unit_metrics.ipynb @@ -166,8 +166,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/how_to/assign_scores_complete_tutorial.ipynb b/site/notebooks/how_to/scoring/assign_scores_complete_tutorial.ipynb similarity index 99% rename from site/notebooks/how_to/assign_scores_complete_tutorial.ipynb rename to site/notebooks/how_to/scoring/assign_scores_complete_tutorial.ipynb index 998a78ace7..933da518ac 100644 --- a/site/notebooks/how_to/assign_scores_complete_tutorial.ipynb +++ b/site/notebooks/how_to/scoring/assign_scores_complete_tutorial.ipynb @@ -168,8 +168,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. 
([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Analytics`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/images/external-data-custom-test.png b/site/notebooks/how_to/tests/custom_tests/external-data-custom-test.png similarity index 100% rename from site/notebooks/images/external-data-custom-test.png rename to site/notebooks/how_to/tests/custom_tests/external-data-custom-test.png diff --git a/site/notebooks/images/hyperparameters-custom-metric.png b/site/notebooks/how_to/tests/custom_tests/hyperparameters-custom-metric.png similarity index 100% rename from site/notebooks/images/hyperparameters-custom-metric.png rename to site/notebooks/how_to/tests/custom_tests/hyperparameters-custom-metric.png diff --git a/site/notebooks/images/image-in-custom-metric.png b/site/notebooks/how_to/tests/custom_tests/image-in-custom-metric.png similarity index 100% rename from site/notebooks/images/image-in-custom-metric.png rename to site/notebooks/how_to/tests/custom_tests/image-in-custom-metric.png diff --git a/site/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb b/site/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb new file mode 100644 index 0000000000..638033b4e5 --- /dev/null +++ b/site/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb @@ -0,0 +1,1093 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implement custom tests\n", + "\n", + "Custom tests extend the functionality of ValidMind, allowing you to document any model or use case with added flexibility.\n", + "\n", + "ValidMind provides a comprehensive set of tests out-of-the-box to evaluate and document your models and datasets. We recognize there will be cases where the default tests do not support a model or dataset, or specific documentation is needed. In these cases, you can create and use your own custom code to accomplish what you need. To streamline custom code integration, we support the creation of custom test functions.\n", + "\n", + "This interactive notebook provides a step-by-step guide for implementing and registering custom tests with ValidMind, running them individually, viewing the results on the ValidMind Platform, and incorporating them into your model documentation template." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Install the ValidMind Library](#toc2_1__) \n", + " - [Initialize the ValidMind Library](#toc2_2__) \n", + " - [Register sample model](#toc2_2_1__) \n", + " - [Apply documentation template](#toc2_2_2__) \n", + " - [Get your code snippet](#toc2_2_3__) \n", + "- [Implement a Custom Test](#toc3__) \n", + "- [Run the Custom Test](#toc4__) \n", + " - [Setup the Model and Dataset](#toc4_1__) \n", + " - [Run the Custom Test](#toc4_2__) \n", + "- [Adding Custom Test to Model Documentation](#toc5__) \n", + "- [Some More Custom Tests](#toc6__) \n", + " - [Custom Test: Table of Model Hyperparameters](#toc6_1__) \n", + " - [Custom Test: External API Call](#toc6_2__) \n", + " - [Custom Test: Passing Parameters](#toc6_3__) \n", + " - [Custom Test: Multiple Tables and Plots in a Single Test](#toc6_4__) \n", + " - [Custom Test: Images](#toc6_5__) \n", + " - [Custom Test: Description](#toc6_6__) \n", + "- [Conclusion](#toc7__) \n", + "- [Next steps](#toc8__) \n", + " - [Work with your model documentation](#toc8_1__) \n", + " - [Discover more learning resources](#toc8_2__) \n", + "- [Upgrade ValidMind](#toc9__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom test can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Binary classification`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Implement a Custom Test\n", + "\n", + "Let's start off by creating a simple custom test that creates a Confusion Matrix for a binary classification model. 
We will use the `sklearn.metrics.confusion_matrix` function to calculate the confusion matrix and then display it using `sklearn.metrics.ConfusionMatrixDisplay`, which produces a `matplotlib` figure. (This is already a built-in test in ValidMind, but we will use it as an example to demonstrate how to create custom tests.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn import metrics\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ConfusionMatrix\")\n", + "def confusion_matrix(dataset, model):\n", + " \"\"\"The confusion matrix is a table that is often used to describe the performance of a classification model on a set of data for which the true values are known.\n", + "\n", + " The confusion matrix is a 2x2 table that contains 4 values:\n", + "\n", + " - True Positive (TP): the number of correct positive predictions\n", + " - True Negative (TN): the number of correct negative predictions\n", + " - False Positive (FP): the number of incorrect positive predictions\n", + " - False Negative (FN): the number of incorrect negative predictions\n", + "\n", + " The confusion matrix can be used to assess the holistic performance of a classification model by showing the accuracy, precision, recall, and F1 score of the model on a single figure.\n", + " \"\"\"\n", + " y_true = dataset.y\n", + " y_pred = dataset.y_pred(model)\n", + "\n", + " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", + "\n", + " cm_display = metrics.ConfusionMatrixDisplay(\n", + " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", + " )\n", + " cm_display.plot()\n", + "\n", + " plt.close() # close the plot to avoid displaying it\n", + "\n", + " return cm_display.figure_ # return the figure object itself" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's our custom test defined and ready to go... Let's take a look at what's going on here:\n", + "\n", + "- The function `confusion_matrix` takes two arguments, `dataset` and `model`. These are a VMDataset and a VMModel object, respectively.\n", + "- The function docstring provides a description of what the test does. This will be displayed along with the result in this notebook as well as in the ValidMind Platform.\n", + "- The function body calculates the confusion matrix using the `sklearn.metrics.confusion_matrix` function and then plots it using `sklearn.metrics.ConfusionMatrixDisplay`.\n", + "- The function then returns the `ConfusionMatrixDisplay.figure_` object - this is important as the ValidMind Library expects the output of the custom test to be a plot or a table.\n", + "- The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ConfusionMatrix` (see the section below on how test IDs work in ValidMind and why this format is important)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Run the Custom Test\n", + "\n", + "Now that we have defined and registered our custom test, let's see how we can run it and properly use it in the ValidMind Platform." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Setup the Model and Dataset\n", + "\n", + "First, let's set up an example model and dataset to run our custom test against. 
Since this is a Confusion Matrix, we will use the Customer Churn dataset that ValidMind provides and train a simple XGBoost model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "from validmind.datasets.classification import customer_churn\n", + "\n", + "raw_df = customer_churn.load_data()\n", + "train_df, validation_df, test_df = customer_churn.preprocess(raw_df)\n", + "\n", + "x_train = train_df.drop(customer_churn.target_column, axis=1)\n", + "y_train = train_df[customer_churn.target_column]\n", + "x_val = validation_df.drop(customer_churn.target_column, axis=1)\n", + "y_val = validation_df[customer_churn.target_column]\n", + "\n", + "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", + "model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "model.fit(\n", + " x_train,\n", + " y_train,\n", + " eval_set=[(x_val, y_val)],\n", + " verbose=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Easy enough! Now we have a model and dataset setup and trained. One last thing to do is bring the dataset and model into the ValidMind Library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for now, we'll just use the test dataset\n", + "vm_test_ds = vm.init_dataset(\n", + " dataset=test_df,\n", + " target_column=customer_churn.target_column,\n", + " input_id=\"test_dataset\",\n", + ")\n", + "\n", + "vm_model = vm.init_model(model, input_id=\"model\")\n", + "\n", + "# link the model to the dataset\n", + "vm_test_ds.assign_predictions(model=vm_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run the Custom Test\n", + "\n", + "Now that we have our model and dataset setup, we have everything we need to run our custom test. We can do this by importing the `run_test` function from the `validmind.tests` module and passing in the test ID of our custom test along with the model and dataset we want to run it against.\n", + "\n", + ">Notice how the `inputs` dictionary is used to map an `input_id` which we set above to the `model` and `dataset` keys that are expected by our custom test function. This is how the ValidMind Library knows which inputs to pass to different tests and is key when using many different datasets and models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "\n", + "result = run_test(\n", + " \"my_custom_tests.ConfusionMatrix\",\n", + " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You'll notice that the docstring becomes a markdown description of the test. The figure is then displayed as the test result. What you see above is how it will look in the ValidMind Platform as well. Let's go ahead and log the result to see how that works." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Adding Custom Test to Model Documentation\n", + "\n", + "To do this, go to the documentation page of the model you registered above and navigate to the `Model Development` -> `Model Evaluation` section. 
Then hover between any existing content block to reveal the `+` button as shown in the screenshot below.\n", + "\n", + "![screenshot showing insert button for test-driven blocks](./insert-test-driven-block.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now click on the `+` button and select the `Test-Driven Block` option. This will open a dialog where you can select `My Custom Tests Confusion Matrix` from the list of available tests. You can preview the result and then click `Insert Block` to add it to the documentation.\n", + "\n", + "![screenshot showing how to insert a test-driven block](./insert-test-driven-block-custom.png)\n", + "\n", + "The test should match the result you see above. It is now part of your documentation and will now be run everytime you run `vm.run_documentation_tests()` for your model. Let's do that now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.reload()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you preview the template, it should show the custom test in the `Model Development`->`Model Evaluation` section:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just so we can run all of the tests in the template, let's initialize the train and raw dataset.\n", + "\n", + "(Refer to [**Quickstart for model documentation**](../../../quickstart/quickstart_model_documentation.ipynb) and the ValidMind docs for more information on what we are doing here)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=raw_df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=customer_churn.target_column,\n", + " class_labels=customer_churn.class_labels,\n", + ")\n", + "\n", + "vm_train_ds = vm.init_dataset(\n", + " dataset=train_df,\n", + " input_id=\"train_dataset\",\n", + " target_column=customer_churn.target_column,\n", + ")\n", + "vm_train_ds.assign_predictions(model=vm_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To run all the tests in the template, you can use the `vm.run_documentation_tests()` and pass the inputs we initialized above and the demo config from our customer_churn module. We will have to add a section to the config for our new test to tell it which inputs it should receive. 
This is done by adding a new entry to the config dictionary, where the key is the ID of the test and the value is a dictionary with the following structure:\n", + "```python\n", + "{\n", + " \"inputs\": {\n", + " \"dataset\": \"test_dataset\",\n", + " \"model\": \"model\",\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.utils import preview_test_config\n", + "\n", + "test_config = customer_churn.get_demo_test_config()\n", + "test_config[\"my_custom_tests.ConfusionMatrix\"] = {\n", + " \"inputs\": {\n", + " \"dataset\": \"test_dataset\",\n", + " \"model\": \"model\",\n", + " }\n", + "}\n", + "preview_test_config(test_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "full_suite = vm.run_documentation_tests(config=test_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Some More Custom Tests\n", + "\n", + "Now that you understand the entire process of creating custom tests and using them in your documentation, let's create a few more to see different ways you can utilize custom tests." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Table of Model Hyperparameters\n", + "\n", + "This custom test will display a table of the hyperparameters used in the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.Hyperparameters\")\n", + "def hyperparameters(model):\n", + " \"\"\"The hyperparameters of a machine learning model are the settings that control the learning process.\n", + " These settings are specified before the learning process begins and can have a significant impact on the\n", + " performance of the model.\n", + "\n", + " The hyperparameters of a model can be used to tune the model to achieve the best possible performance\n", + " on a given dataset. By examining the hyperparameters of a model, you can gain insight into how the model\n", + " was trained and how it might be improved.\n", + " \"\"\"\n", + " hyperparameters = model.model.get_xgb_params() # dictionary of hyperparameters\n", + "\n", + " # turn the dictionary into a table where each row contains a hyperparameter and its value\n", + " return [{\"Hyperparam\": k, \"Value\": v} for k, v in hyperparameters.items() if v]\n", + "\n", + "\n", + "result = run_test(\"my_custom_tests.Hyperparameters\", inputs={\"model\": \"model\"})\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the test has been run and logged, you can add it to your documentation using the same process as above. It should look like this:\n", + "\n", + "![screenshot showing hyperparameters test](./hyperparameters-custom-metric.png)\n", + "\n", + "For our simple toy model, there aren't really any proper hyperparameters, but you can see how this could be useful for more complex models that have gone through hyperparameter tuning." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: External API Call\n", + "\n", + "This custom test calls an external API to get a list of fake users and displays the relevant data as tables. This demonstrates how you might integrate external data sources into your model documentation in a programmatic way. 
You could, for instance, set up a pipeline that runs a test like this every day to keep your model documentation in sync with an external system." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import random\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ExternalAPI\")\n", + "def external_api():\n", + " \"\"\"This test calls an external API to get a list of fake users. It then creates\n", + " a table with the relevant data so it can be displayed in the documentation.\n", + "\n", + " The purpose of this test is to demonstrate how to call an external API and use the\n", + " data in a test. A test like this could even be set up to run in a scheduled\n", + " pipeline to keep your documentation in sync with an external data source.\n", + " \"\"\"\n", + " url = \"https://jsonplaceholder.typicode.com/users\"\n", + " response = requests.get(url)\n", + " data = response.json()\n", + "\n", + " # build tables of model stakeholders, developers, and validators from the user data\n", + " return {\n", + " \"Model Owners/Stakeholders\": [\n", + " {\n", + " \"Name\": user[\"name\"],\n", + " \"Role\": random.choice([\"Owner\", \"Stakeholder\"]),\n", + " \"Email\": user[\"email\"],\n", + " \"Phone\": user[\"phone\"],\n", + " \"Slack Handle\": f\"@{user['name'].lower().replace(' ', '.')}\",\n", + " }\n", + " for user in data[:3]\n", + " ],\n", + " \"Model Developers\": [\n", + " {\n", + " \"Name\": user[\"name\"],\n", + " \"Role\": \"Developer\",\n", + " \"Email\": user[\"email\"],\n", + " }\n", + " for user in data[3:7]\n", + " ],\n", + " \"Model Validators\": [\n", + " {\n", + " \"Name\": user[\"name\"],\n", + " \"Role\": \"Validator\",\n", + " \"Email\": user[\"email\"],\n", + " }\n", + " for user in data[7:]\n", + " ],\n", + " }\n", + "\n", + "\n", + "result = run_test(\"my_custom_tests.ExternalAPI\")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, you can add this to your documentation to see how it looks:\n", + "\n", + "![screenshot showing the external API call test](./external-data-custom-test.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Passing Parameters\n", + "\n", + "Custom test functions, as stated earlier, can take both inputs and params. When you define your function, there is no need to distinguish between the two; the ValidMind Library will handle that for you. 
You simply need to add both to the function as arguments and the library will pass in the correct values.\n", + "\n", + "So for instance, if you wanted to parameterize the first custom test we created, the confusion matrix, you could do so like this:\n", + "\n", + "```python\n", + "def confusion_matrix(dataset: VMDataset, model: VMModel, my_param: str = \"Default Value\"):\n", + " pass\n", + "```\n", + "\n", + "And then when you run the test, you can pass in the parameter like this:\n", + "\n", + "```python\n", + "vm.run_test(\n", + " \"my_custom_tests.ConfusionMatrix\",\n", + " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", + " params={\"my_param\": \"My Value\"},\n", + ")\n", + "```\n", + "\n", + "Or if you are running the entire documentation template, you would update the config like this:\n", + "\n", + "```python\n", + "test_config[\"my_custom_tests.ConfusionMatrix\"] = {\n", + " \"inputs\": {\n", + " \"dataset\": \"test_dataset\",\n", + " \"model\": \"model\",\n", + " },\n", + " \"params\": {\n", + " \"my_param\": \"My Value\",\n", + " },\n", + "}\n", + "```\n", + "\n", + "Let's go ahead and create a toy test that takes a parameter and uses it in the result:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.express as px\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ParameterExample\")\n", + "def parameter_example(\n", + " plot_title=\"Default Plot Title\", x_col=\"sepal_width\", y_col=\"sepal_length\"\n", + "):\n", + " \"\"\"This test takes two parameters and creates a scatter plot based on them.\n", + "\n", + " The purpose of this test is to demonstrate how to create a test that takes\n", + " parameters and uses them to generate a plot. This can be useful for creating\n", + " tests that are more flexible and can be used in a variety of scenarios.\n", + " \"\"\"\n", + " # return px.scatter(px.data.iris(), x=x_col, y=y_col, color=\"species\")\n", + " return px.scatter(\n", + " px.data.iris(), x=x_col, y=y_col, color=\"species\", title=plot_title\n", + " )\n", + "\n", + "\n", + "result = run_test(\n", + " \"my_custom_tests.ParameterExample\",\n", + " params={\n", + " \"plot_title\": \"My Cool Plot\",\n", + " \"x_col\": \"sepal_width\",\n", + " \"y_col\": \"sepal_length\",\n", + " },\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Play around with this and see how you can use parameters, default values and other features to make your custom tests more flexible and useful.\n", + "\n", + "Here's how this one looks in the documentation:\n", + "![screenshot showing parameterized test](./parameterized-custom-metric.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Multiple Tables and Plots in a Single Test\n", + "\n", + "Custom test functions, as stated earlier, can return more than just one table or plot. In fact, any number of tables and plots can be returned. 
Let's see an example of this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import plotly.express as px\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ComplexOutput\")\n", + "def complex_output():\n", + " \"\"\"This test demonstrates how to return many tables and figures in a single test\"\"\"\n", + " # create a couple tables\n", + " table = [{\"A\": 1, \"B\": 2}, {\"A\": 3, \"B\": 4}]\n", + " table2 = [{\"C\": 5, \"D\": 6}, {\"C\": 7, \"D\": 8}]\n", + "\n", + " # create a few figures showing some random data\n", + " fig1 = px.line(x=np.arange(10), y=np.random.rand(10), title=\"Random Line Plot\")\n", + " fig2 = px.bar(x=[\"A\", \"B\", \"C\"], y=np.random.rand(3), title=\"Random Bar Plot\")\n", + " fig3 = px.scatter(\n", + " x=np.random.rand(10), y=np.random.rand(10), title=\"Random Scatter Plot\"\n", + " )\n", + "\n", + " return (\n", + " {\n", + " \"My Cool Table\": table,\n", + " \"Another Table\": table2,\n", + " },\n", + " fig1,\n", + " fig2,\n", + " fig3,\n", + " )\n", + "\n", + "\n", + "result = run_test(\"my_custom_tests.ComplexOutput\")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how you can return the tables as a dictionary where the key is the title of the table and the value is the table itself. You could also just return the tables by themselves but this way you can give them a title to more easily identify them in the result.\n", + "\n", + "![screenshot showing multiple tables and plots](./multiple-tables-plots-custom-metric.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Images\n", + "\n", + "If you are using a plotting library that isn't supported by ValidMind (i.e. not `matplotlib` or `plotly`), you can still return the image directly as a bytes-like object. This could also be used to bring any type of image into your documentation in a programmatic way. For instance, you may want to include a diagram of your model architecture or a screenshot of a dashboard that your model is integrated with. As long as you can produce the image with Python or open it from a file, you can include it in your documentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.Image\")\n", + "def image():\n", + " \"\"\"This test demonstrates how to return an image in a test\"\"\"\n", + "\n", + " # create a simple plot\n", + " fig, ax = plt.subplots()\n", + " ax.plot([1, 2, 3, 4])\n", + " ax.set_title(\"Simple Line Plot\")\n", + "\n", + " # save the plot as a PNG image (in-memory buffer)\n", + " img_data = io.BytesIO()\n", + " fig.savefig(img_data, format=\"png\")\n", + " img_data.seek(0)\n", + "\n", + " plt.close() # close the plot to avoid displaying it\n", + "\n", + " return img_data.read()\n", + "\n", + "\n", + "result = run_test(\"my_custom_tests.Image\")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding this custom test to your documentation will display the image:\n", + "\n", + "![screenshot showing image custom test](./image-in-custom-metric.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to log an image as a test result, you can do so by passing the path to the image as a parameter to the custom test and then opening the file in the test function. Here's an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.MyPNGCorrelationMatrix\")\n", + "def my_png_correlation_matrix(path: str):\n", + " \"\"\"Opens a PNG image file and logs it as a test result to ValidMind\"\"\"\n", + " if not path.endswith(\".png\"):\n", + " raise ValueError(\"Image must be a PNG file\")\n", + "\n", + " # return raw image bytes\n", + " with open(path, \"rb\") as f:\n", + " return f.read()\n", + "\n", + "\n", + "run_test(\n", + " \"my_custom_tests.MyPNGCorrelationMatrix\",\n", + " params={\"path\": \"./pearson-correlation-matrix.png\"},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The image is displayed in the test result:\n", + "\n", + "![screenshot showing image from file](./pearson-correlation-matrix-test-output.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Description\n", + "\n", + "If you want to write the test result description for your custom test yourself, instead of having it generated by an LLM, you can do so by returning a string from your test." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.MyCustomTest\")\n", + "def my_custom_test(dataset, model):\n", + " \"\"\"\n", + " This custom test computes the confusion matrix for a binary classification model and returns a figure, a custom description string, and a table.\n", + " \"\"\"\n", + " y_true = dataset.y\n", + " y_pred = dataset.y_pred(model)\n", + "\n", + " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", + "\n", + " cm_display = metrics.ConfusionMatrixDisplay(\n", + " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", + " )\n", + " cm_display.plot()\n", + "\n", + " plt.close() # close the plot to avoid displaying it\n", + "\n", + " return cm_display.figure_, \"Test Description - Confusion Matrix\", pd.DataFrame({\"Value\": [1, 2, 3]}) # return the figure, a custom description string, and a table\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the test below and you can see that the test result description has been customized. The same description will be displayed in the UI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.MyCustomTest\",\n", + " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Conclusion\n", + "\n", + "In this notebook, we have demonstrated how to create custom tests in ValidMind. We have shown how to define custom test functions, register them with the ValidMind Library, run them against models and datasets, and add them to model documentation templates. We have also shown how to return tables and plots from custom tests and how to use them in the ValidMind Platform. We hope this tutorial has been helpful in understanding how to create and use custom tests in ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", + "\n", + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. From the **Model Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. Click and expand the **Model Development** section.\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. 
[Learn more ...](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html)\n", + "\n", + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/developer/how-to/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/developer/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-997b933948594ddd929ee9419957dfe3", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/site/notebooks/images/insert-test-driven-block-custom.png b/site/notebooks/how_to/tests/custom_tests/insert-test-driven-block-custom.png similarity index 100% rename from site/notebooks/images/insert-test-driven-block-custom.png rename to site/notebooks/how_to/tests/custom_tests/insert-test-driven-block-custom.png diff --git a/site/notebooks/images/insert-test-driven-block-test-provider.png b/site/notebooks/how_to/tests/custom_tests/insert-test-driven-block-test-provider.png similarity index 100% rename from site/notebooks/images/insert-test-driven-block-test-provider.png rename to site/notebooks/how_to/tests/custom_tests/insert-test-driven-block-test-provider.png diff --git a/site/notebooks/images/insert-test-driven-block.png b/site/notebooks/how_to/tests/custom_tests/insert-test-driven-block.png similarity index 100% rename from site/notebooks/images/insert-test-driven-block.png rename to site/notebooks/how_to/tests/custom_tests/insert-test-driven-block.png diff --git a/site/notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb b/site/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb similarity index 99% rename from site/notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb rename to site/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb index ca1403fb6c..fd84b39ca6 100644 --- a/site/notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb +++ b/site/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb @@ -183,8 +183,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." @@ -758,7 +756,7 @@ "\n", "Now that the result has been logged to the ValidMind Platform, you can add it to your model documentation. This will add the result where you specify but it also will add the test to the template so it gets run anytime you `run_documentation_tests()`. To do this, go to the documentation page of the model you connected to above and navigate to the `Model Development` -> `Model Evaluation` section. Then hover between any existing content block to reveal the `+` button as shown in the screenshot below.\n", "\n", - "![screenshot showing insert button for test-driven blocks](../../images/insert-test-driven-block.png)" + "![screenshot showing insert button for test-driven blocks](./insert-test-driven-block.png)" ] }, { @@ -767,7 +765,7 @@ "source": [ "Now click on the `+` button and select the `Test-Driven Block` option. This will open a dialog where you can select `My Test Provider Confusion Matrix` from the list of available tests. 
You can preview the result and then click `Insert Block` to add it to the documentation.\n", "\n", - "![screenshot showing how to insert a test-driven block](../../images/insert-test-driven-block-test-provider.png)\n", + "![screenshot showing how to insert a test-driven block](./insert-test-driven-block-test-provider.png)\n", "\n", "The test should match the result you see above." ] diff --git a/site/notebooks/images/multiple-tables-plots-custom-metric.png b/site/notebooks/how_to/tests/custom_tests/multiple-tables-plots-custom-metric.png similarity index 100% rename from site/notebooks/images/multiple-tables-plots-custom-metric.png rename to site/notebooks/how_to/tests/custom_tests/multiple-tables-plots-custom-metric.png diff --git a/site/notebooks/images/parameterized-custom-metric.png b/site/notebooks/how_to/tests/custom_tests/parameterized-custom-metric.png similarity index 100% rename from site/notebooks/images/parameterized-custom-metric.png rename to site/notebooks/how_to/tests/custom_tests/parameterized-custom-metric.png diff --git a/site/notebooks/images/pearson-correlation-matrix-test-output.png b/site/notebooks/how_to/tests/custom_tests/pearson-correlation-matrix-test-output.png similarity index 100% rename from site/notebooks/images/pearson-correlation-matrix-test-output.png rename to site/notebooks/how_to/tests/custom_tests/pearson-correlation-matrix-test-output.png diff --git a/site/notebooks/images/pearson-correlation-matrix.png b/site/notebooks/how_to/tests/custom_tests/pearson-correlation-matrix.png similarity index 100% rename from site/notebooks/images/pearson-correlation-matrix.png rename to site/notebooks/how_to/tests/custom_tests/pearson-correlation-matrix.png diff --git a/site/notebooks/how_to/explore_test_suites.ipynb b/site/notebooks/how_to/tests/explore_tests/explore_test_suites.ipynb similarity index 99% rename from site/notebooks/how_to/explore_test_suites.ipynb rename to site/notebooks/how_to/tests/explore_tests/explore_test_suites.ipynb index f9f0a72861..a14d064637 100644 --- a/site/notebooks/how_to/explore_test_suites.ipynb +++ b/site/notebooks/how_to/tests/explore_tests/explore_test_suites.ipynb @@ -85,7 +85,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -672,7 +672,7 @@ "\n", "
Learn more about the individual tests available in the ValidMind Library\n", "

\n", - "Check out our Explore tests notebook for more code examples and usage of key functions.
\n", + "Check out our Explore tests notebook for more code examples and usage of key functions.\n", "\n", "\n", "\n", diff --git a/site/notebooks/how_to/explore_test_suites_output.png b/site/notebooks/how_to/tests/explore_tests/explore_test_suites_output.png similarity index 100% rename from site/notebooks/how_to/explore_test_suites_output.png rename to site/notebooks/how_to/tests/explore_tests/explore_test_suites_output.png diff --git a/site/notebooks/how_to/explore_tests.ipynb b/site/notebooks/how_to/tests/explore_tests/explore_tests.ipynb similarity index 99% rename from site/notebooks/how_to/explore_tests.ipynb rename to site/notebooks/how_to/tests/explore_tests/explore_tests.ipynb index 9fc1d3292e..ef5680e8fd 100644 --- a/site/notebooks/how_to/explore_tests.ipynb +++ b/site/notebooks/how_to/tests/explore_tests/explore_tests.ipynb @@ -86,7 +86,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -4371,7 +4371,7 @@ "\n", "
Learn about the tests suites available in the ValidMind Library.\n", "

\n", - "Check out our Explore test suites notebook for more code examples and usage of key functions.
\n", + "Check out our Explore test suites notebook for more code examples and usage of key functions.\n", "\n", "\n", "\n", diff --git a/site/notebooks/how_to/test_suite_output1.png b/site/notebooks/how_to/tests/explore_tests/test_suite_output1.png similarity index 100% rename from site/notebooks/how_to/test_suite_output1.png rename to site/notebooks/how_to/tests/explore_tests/test_suite_output1.png diff --git a/site/notebooks/how_to/test_suite_output2.png b/site/notebooks/how_to/tests/explore_tests/test_suite_output2.png similarity index 100% rename from site/notebooks/how_to/test_suite_output2.png rename to site/notebooks/how_to/tests/explore_tests/test_suite_output2.png diff --git a/site/notebooks/how_to/run_tests/1_run_dataset_based_tests.ipynb b/site/notebooks/how_to/tests/run_tests/1_run_dataset_based_tests.ipynb similarity index 97% rename from site/notebooks/how_to/run_tests/1_run_dataset_based_tests.ipynb rename to site/notebooks/how_to/tests/run_tests/1_run_dataset_based_tests.ipynb index a901849c4b..263ee130ab 100644 --- a/site/notebooks/how_to/run_tests/1_run_dataset_based_tests.ipynb +++ b/site/notebooks/how_to/tests/run_tests/1_run_dataset_based_tests.ipynb @@ -14,7 +14,7 @@ "- Initialize a ValidMind dataset \n", "- Pass the dataset to the `run_test` fuction for any test that takes a `dataset` input\n", "\n", - "**We recommended that you first complete the [Explore tests](../explore_tests.ipynb) notebook,** to understand the basics of how to find and describe all the available tests in the ValidMind Library before moving on to this advanced guide.\n", + "**We recommended that you first complete the [Explore tests](../explore_tests/explore_tests.ipynb) notebook,** to understand the basics of how to find and describe all the available tests in the ValidMind Library before moving on to this advanced guide.\n", "\n", "This interactive notebook provides a step-by-step guide for listing and filtering available tests, building a sample dataset, initializing the required ValidMind objects, running the test, and then logging the results to ValidMind. " ] @@ -105,7 +105,7 @@ "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -170,8 +170,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. 
([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." @@ -241,7 +239,7 @@ "\n", "Before we run a test, let's find a suitable test for this demonstration. Let's assume you want to generate the *pearson correlation matrix* for a dataset. A Pearson correlation matrix is a table that shows the [Pearson correlation coefficients](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) between several variables. \n", "\n", - "In the [Explore tests](../explore_tests.ipynb) notebook, we learned how to pass a `filter` to the `list_tests` function. We'll do the same here to find the test ID for the pearson correlation matrix:" + "In the [Explore tests](../explore_tests/explore_tests.ipynb) notebook, we learned how to pass a `filter` to the `list_tests` function. We'll do the same here to find the test ID for the pearson correlation matrix:" ] }, { @@ -584,7 +582,7 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10.13", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/site/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb b/site/notebooks/how_to/tests/run_tests/2_run_comparison_tests.ipynb similarity index 97% rename from site/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb rename to site/notebooks/how_to/tests/run_tests/2_run_comparison_tests.ipynb index 5aa8d78dba..4ce26f074b 100644 --- a/site/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb +++ b/site/notebooks/how_to/tests/run_tests/2_run_comparison_tests.ipynb @@ -15,7 +15,7 @@ "- Initialize a ValidMind model and assign predictions to a dataset\n", "- Run a comparison test with `run_test` function\n", "\n", - "**We recommended that you first complete the [Explore tests](../explore_tests.ipynb) and the [Run dataset based tests](./1_run_dataset_based_tests.ipynb) notebooks** to understand the basics of how to find and describe all the available tests in the ValidMind Library and how to run tests before moving on to this guide.\n", + "**We recommended that you first complete the [Explore tests](../explore_tests/explore_tests.ipynb) and the [Run dataset based tests](./1_run_dataset_based_tests.ipynb) notebooks** to understand the basics of how to find and describe all the available tests in the ValidMind Library and how to run tests before moving on to this guide.\n", "\n", "This interactive notebook provides a step-by-step guide for listing and filtering available tests, building a sample dataset, training a model, initializing the required ValidMind objects, running a comparison test, and then logging the results to ValidMind. 
" ] @@ -113,7 +113,7 @@ "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -174,8 +174,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." @@ -267,7 +265,7 @@ "\n", "Before we run a comparison test, let's find a suitable test for this demonstration. Let's assume you want to evaluate the performance results for a model.\n", "\n", - "In the [Explore tests](../explore_tests.ipynb) notebook, we learned how to pass a `filter` to the `list_tests` function. We'll do the same here to find the test ID for the confusion matrix:" + "In the [Explore tests](../explore_tests/explore_tests.ipynb) notebook, we learned how to pass a `filter` to the `list_tests` function. We'll do the same here to find the test ID for the confusion matrix:" ] }, { diff --git a/site/notebooks/how_to/customize_test_result_descriptions.ipynb b/site/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb similarity index 99% rename from site/notebooks/how_to/customize_test_result_descriptions.ipynb rename to site/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb index c3aca95e16..ab36da9131 100644 --- a/site/notebooks/how_to/customize_test_result_descriptions.ipynb +++ b/site/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb @@ -121,8 +121,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/how_to/enable_pii_detection.ipynb b/site/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb similarity index 99% rename from site/notebooks/how_to/enable_pii_detection.ipynb rename to site/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb index 3d648e82cf..2eb9ae0116 100644 --- a/site/notebooks/how_to/enable_pii_detection.ipynb +++ b/site/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb @@ -124,7 +124,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -289,7 +289,7 @@ "source": [ "
Want to learn more about custom tests?\n", "

\n", - "Check out our extended introduction to custom tests — Implement custom tests
" + "Check out our extended introduction to custom tests — Implement custom tests" ] }, { diff --git a/site/notebooks/how_to/filter_input_columns.ipynb b/site/notebooks/how_to/tests/run_tests/configure_tests/filter_input_columns.ipynb similarity index 100% rename from site/notebooks/how_to/filter_input_columns.ipynb rename to site/notebooks/how_to/tests/run_tests/configure_tests/filter_input_columns.ipynb diff --git a/site/notebooks/how_to/run_tests_that_require_multiple_datasets.ipynb b/site/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.ipynb similarity index 98% rename from site/notebooks/how_to/run_tests_that_require_multiple_datasets.ipynb rename to site/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.ipynb index d292c4461d..74e67f61b7 100644 --- a/site/notebooks/how_to/run_tests_that_require_multiple_datasets.ipynb +++ b/site/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.ipynb @@ -105,7 +105,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -170,8 +170,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/how_to/understand_utilize_rawdata.ipynb b/site/notebooks/how_to/tests/run_tests/configure_tests/understand_utilize_rawdata.ipynb similarity index 100% rename from site/notebooks/how_to/understand_utilize_rawdata.ipynb rename to site/notebooks/how_to/tests/run_tests/configure_tests/understand_utilize_rawdata.ipynb diff --git a/site/notebooks/how_to/document_multiple_results_for_the_same_test.ipynb b/site/notebooks/how_to/tests/run_tests/documentation_tests/document_multiple_results_for_the_same_test.ipynb similarity index 99% rename from site/notebooks/how_to/document_multiple_results_for_the_same_test.ipynb rename to site/notebooks/how_to/tests/run_tests/documentation_tests/document_multiple_results_for_the_same_test.ipynb index 74e04e1ab9..0feb618659 100644 --- a/site/notebooks/how_to/document_multiple_results_for_the_same_test.ipynb +++ b/site/notebooks/how_to/tests/run_tests/documentation_tests/document_multiple_results_for_the_same_test.ipynb @@ -110,7 +110,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -175,8 +175,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/how_to/run_documentation_sections.ipynb b/site/notebooks/how_to/tests/run_tests/documentation_tests/run_documentation_sections.ipynb similarity index 98% rename from site/notebooks/how_to/run_documentation_sections.ipynb rename to site/notebooks/how_to/tests/run_tests/documentation_tests/run_documentation_sections.ipynb index fb4412df95..0952c0c9ce 100644 --- a/site/notebooks/how_to/run_documentation_sections.ipynb +++ b/site/notebooks/how_to/tests/run_tests/documentation_tests/run_documentation_sections.ipynb @@ -103,7 +103,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -168,8 +168,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
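Since the hunk above moves the notebook on running individual documentation sections, a compact sketch of that workflow may help orient readers. It assumes `vm.init()` has already been run with a model's code snippet, that `vm_train_ds` and `vm_model` exist, and that the section ID comes from your own template (`"data_preparation"` is only an example):

```python
import validmind as vm

# Inspect the documentation template to find the section IDs it defines.
vm.preview_template()

# Run only the tests that belong to a single section of the template.
results = vm.run_documentation_tests(
    section="data_preparation",  # illustrative section ID from your template
    inputs={"dataset": vm_train_ds, "model": vm_model},
)
```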
diff --git a/site/notebooks/how_to/run_documentation_tests_with_config.ipynb b/site/notebooks/how_to/tests/run_tests/documentation_tests/run_documentation_tests_with_config.ipynb similarity index 99% rename from site/notebooks/how_to/run_documentation_tests_with_config.ipynb rename to site/notebooks/how_to/tests/run_tests/documentation_tests/run_documentation_tests_with_config.ipynb index a96df1a5b1..548c020cd8 100644 --- a/site/notebooks/how_to/run_documentation_tests_with_config.ipynb +++ b/site/notebooks/how_to/tests/run_tests/documentation_tests/run_documentation_tests_with_config.ipynb @@ -107,7 +107,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -172,8 +172,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/how_to/use_dataset_model_objects.ipynb b/site/notebooks/how_to/use_dataset_model_objects.ipynb deleted file mode 100644 index 12431963c5..0000000000 --- a/site/notebooks/how_to/use_dataset_model_objects.ipynb +++ /dev/null @@ -1,1005 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction to ValidMind Dataset and Model Objects\n", - "\n", - "When writing custom tests, it is essential to be aware of the interfaces of the ValidMind Dataset and ValidMind Model, which are used as input arguments.\n", - "\n", - "As a model developer, writing custom tests is beneficial when the ValidMind library lacks a built-in test for your specific needs. For example, a model might require new tests to evaluate specific aspects of the model or dataset based on a particular use case.\n", - "\n", - "This interactive notebook offers a detailed understanding of ValidMind objects and their use in writing custom tests. It introduces various interfaces provided by these objects and demonstrates how they can be leveraged to implement tests effortlessly." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Install the ValidMind Library](#toc2_1__) \n", - " - [Initialize the ValidMind Library](#toc2_2__) \n", - " - [Register sample model](#toc2_2_1__) \n", - " - [Apply documentation template](#toc2_2_2__) \n", - " - [Get your code snippet](#toc2_2_3__) \n", - "- [Load the demo dataset](#toc3__) \n", - " - [Prepocess the raw dataset](#toc3_1__) \n", - "- [Train a model for testing](#toc4__) \n", - "- [Explore basic components of the ValidMind library](#toc5__) \n", - " - [VMDataset Object](#toc5_1__) \n", - " - [Initialize the ValidMind datasets](#toc5_1_1__) \n", - " - [ Interfaces of the dataset object](#toc5_1_2__) \n", - " - [Using VM Dataset object as arguments in custom tests](#toc5_2__) \n", - " - [Run the test](#toc5_2_1__) \n", - " - [Using VM Dataset object and parameters as arguments in custom tests](#toc5_3__) \n", - " - [VMModel Object](#toc5_4__) \n", - " - [Initialize ValidMind model object](#toc5_5__) \n", - " - [Assign predictions to the datasets](#toc5_6__) \n", - " - [Using VM Model and Dataset objects as arguments in Custom tests](#toc5_7__) \n", - " - [Log the test results](#toc5_8__) \n", - "- [Where to go from here](#toc6__) \n", - " - [Use cases](#toc6_1__) \n", - " - [More how-to guides and code samples](#toc6_2__) \n", - " - [Discover more learning resources](#toc6_3__) \n", - "- [Upgrade ValidMind](#toc7__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
To access all features available in this notebook, you'll need a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "Here, we will focus on ValidMind dataset, ValidMind model and tests to use these objects to generate artefacts for the documentation.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - "- **model**: A single ValidMind model object that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - "- **dataset**: Single ValidMind dataset object that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Dataset based Test**\n", - "\n", - "![Dataset based test architecture](./dataset_image.png)\n", - "The dataset based tests take VM dataset object(s) as inputs, test configuration as test parameters to produce `Outputs` as mentioned above.\n", - "\n", - "**Model based Test**\n", - "\n", - "![Model based test architecture](./model_image.png)\n", - "Similar to datasest based tests, the model based tests as an additional input that is VM model object. It allows to identify prediction values of a specific model in the dataset object. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "Please note the following recommended Python versions to use:\n", - "\n", - "- Python 3.7 > x <= 3.11\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. 
In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Binary classification`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. 
Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "metadata": {} - }, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import xgboost as xgb" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Load the demo dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.datasets.classification import customer_churn as demo_dataset\n", - "\n", - "raw_df = demo_dataset.load_data()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Prepocess the raw dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_df, validation_df, test_df = demo_dataset.preprocess(raw_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Train a model for testing\n", - "\n", - "We train a simple customer churn model for our test." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "x_train = train_df.drop(demo_dataset.target_column, axis=1)\n", - "y_train = train_df[demo_dataset.target_column]\n", - "x_val = validation_df.drop(demo_dataset.target_column, axis=1)\n", - "y_val = validation_df[demo_dataset.target_column]\n", - "\n", - "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", - "model.set_params(\n", - " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", - ")\n", - "model.fit(\n", - " x_train,\n", - " y_train,\n", - " eval_set=[(x_val, y_val)],\n", - " verbose=False,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Explore basic components of the ValidMind library\n", - "\n", - "In this section, you will learn about the basic objects of the ValidMind library that are necessary to implement both custom and built-in tests. 
As explained above, these objects are:\n", - "* VMDataset: [The high level APIs can be found here](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset)\n", - "* VMModel: [The high level APIs can be found here](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMModel)\n", - "\n", - "Let's understand these objects and their interfaces step by step: " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### VMDataset Object" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Initialize the ValidMind datasets\n", - "\n", - "You can initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module.\n", - "\n", - "The function wraps the dataset to create a ValidMind `Dataset` object so that you can write tests effectively using the common interface provided by the VM objects. This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind. You only need to do it one time per dataset.\n", - "\n", - "This function takes a number of arguments. Some of the arguments are:\n", - "\n", - "- `dataset` — the raw dataset that you want to provide as input to tests\n", - "- `input_id` - a unique identifier that allows tracking what inputs are used when running each individual test\n", - "- `target_column` — a required argument if tests require access to true values. This is the name of the target column in the dataset\n", - "\n", - "The detailed list of the arguments can be found [here](https://docs.validmind.ai/validmind/validmind.html#init_dataset) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# vm_raw_dataset is now a VMDataset object that you can pass to any ValidMind test\n", - "vm_raw_dataset = vm.init_dataset(\n", - " dataset=raw_df,\n", - " input_id=\"raw_dataset\",\n", - " target_column=\"Exited\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you have a ValidMind dataset object (VMDataset), you can inspect its attributes and methods using the inspect_obj utility module. This module provides a list of available attributes and interfaces for use in tests. Understanding how to use VMDatasets is crucial for comprehending how a custom test functions." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.utils import inspect_obj\n", - "inspect_obj(vm_raw_dataset)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Interfaces of the dataset object\n", - "\n", - "**DataFrame**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset.df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Feature columns**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset.feature_columns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Target column**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset.target_column" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Features values**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset.x_df()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Target value**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset.y_df()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Numeric feature columns** " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset.feature_columns_numeric" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Categorical feature columns** " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset.feature_columns_categorical" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Similarly, you can use all other interfaces of the [VMDataset objects](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Using VM Dataset object as arguments in custom tests\n", - "\n", - "A custom test is simply a Python function that takes two types of arguments: `inputs` and `params`. The `inputs` are ValidMind objects (`VMDataset`, `VMModel`), and the `params` are additional parameters required for the underlying computation of the test. We will discuss both types of arguments in the following sections.\n", - "\n", - "Let's start with a custom test that requires only a ValidMind dataset object. In this example, we will check the balance of classes in the target column of the dataset:\n", - "\n", - "- The custom test below requires a single argument of type `VMDataset` (dataset).\n", - "- The `my_custom_tests.ClassImbalance` is a unique test identifier that can be assigned using the `vm.test` decorator functionality. 
This unique test ID will be used in the platform to load test results in the documentation.\n", - "- The `dataset.target_column` and `dataset.df` attributes of the `VMDataset` object are used in the test.\n", - "\n", - "Other high-level APIs (attributes and methods) of the dataset object are listed [here](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset).\n", - "\n", - "If you've gone through the [Implement custom tests notebook](../code_samples/custom_tests/implement_custom_tests.ipynb), you should have a good understanding of how custom tests are implemented in details. If you haven't, we recommend going through that notebook first." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.vm_models.dataset.dataset import VMDataset\n", - "import pandas as pd\n", - "\n", - "@vm.test(\"my_custom_tests.ClassImbalance\")\n", - "def class_imbalance(dataset):\n", - " # Can only run this test if we have a Dataset object\n", - " if not isinstance(dataset, VMDataset):\n", - " raise ValueError(\"ClassImbalance requires a validmind Dataset object\")\n", - "\n", - " if dataset.target_column is None:\n", - " print(\"Skipping class_imbalance test because no target column is defined\")\n", - " return\n", - "\n", - " # VMDataset object provides target_column attribute\n", - " target_column = dataset.target_column\n", - " # we can access pandas DataFrame using df attribute\n", - " imbalance_percentages = dataset.df[target_column].value_counts(\n", - " normalize=True\n", - " )\n", - " classes = list(imbalance_percentages.index) \n", - " percentages = list(imbalance_percentages.values * 100)\n", - "\n", - " return pd.DataFrame({\"Classes\":classes, \"Percentage\": percentages})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Run the test\n", - "\n", - "Let's run the test using the `run_test` method, which is part of the `validmind.tests` module. Here, we pass the `dataset` through the `inputs`. Similarly, you can pass `datasets`, `model`, or `models` as inputs if your custom test requires them. In this example below, we run the custom test `my_custom_tests.ClassImbalance` by passing the `dataset` through the `inputs`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "result = run_test(\n", - " test_id=\"my_custom_tests.ClassImbalance\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can move custom tests into separate modules in a folder. It allows you to take one-off tests and move them into an organized structure that makes it easier to manage, maintain and share them. We have provided a seperate notebook with detailed explaination [here](../code_samples/custom_tests/integrate_external_test_providers.ipynb) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Using VM Dataset object and parameters as arguments in custom tests\n", - "\n", - "Simlilar to `inputs`, you can pass `params` to a custom test by providing a dictionary of parameters to the `run_test()` function. The parameters will override any default parameters set in the custom test definition. Note that the `dataset` is still passed as `inputs`. 
\n", - "Let's modify the class imbalance test so that it provides flexibility to `normalize` the results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.vm_models.dataset.dataset import VMDataset\n", - "import pandas as pd\n", - "\n", - "@vm.test(\"my_custom_tests.ClassImbalance\")\n", - "def class_imbalance(dataset, normalize=True):\n", - " # Can only run this test if we have a Dataset object\n", - " if not isinstance(dataset, VMDataset):\n", - " raise ValueError(\"ClassImbalance requires a validmind Dataset object\")\n", - "\n", - " if dataset.target_column is None:\n", - " print(\"Skipping class_imbalance test because no target column is defined\")\n", - " return\n", - "\n", - " # VMDataset object provides target_column attribute\n", - " target_column = dataset.target_column\n", - " # we can access pandas DataFrame using df attribute\n", - " imbalance_percentages = dataset.df[target_column].value_counts(\n", - " normalize=normalize\n", - " )\n", - " classes = list(imbalance_percentages.index) \n", - " if normalize: \n", - " result = pd.DataFrame({\"Classes\":classes, \"Percentage\": list(imbalance_percentages.values*100)})\n", - " else:\n", - " result = pd.DataFrame({\"Classes\":classes, \"Count\": list(imbalance_percentages.values)})\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example, the `normalize` parameter is set to `False`, so the class counts will not be normalized. You can change the value to `True` if you want the counts to be normalized. The results of the test will reflect this flexibility, allowing for different outputs based on the parameter passed.\n", - "\n", - "Here, we have passed the `dataset` through the `inputs` and the `normalize` parameter using the `params`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "result = run_test(\n", - " test_id = \"my_custom_tests.ClassImbalance\",\n", - " inputs={\"dataset\": vm_raw_dataset},\n", - " params={\"normalize\": True},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### VMModel Object" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize ValidMind model object\n", - "\n", - "Similar to ValidMind `Dataset` object, you can initialize a ValidMind Model object using the [`init_model`](https://docs.validmind.ai/validmind/validmind.html#init_model) function from the ValidMind (`vm`) module.\n", - "\n", - "This function takes a number of arguments. 
Some of the arguments are:\n", - "\n", - "- `model` — the raw model that you want evaluate\n", - "- `input_id` - a unique identifier that allows tracking what inputs are used when running each individual test\n", - "\n", - "The detailed list of the arguments can be found [here](https://docs.validmind.ai/validmind/validmind.html#init_model) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "vm_model = vm.init_model(\n", - " model=model,\n", - " input_id=\"xgb_model\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's inspect the methods and attributes of the model now:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "inspect_obj(vm_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Assign predictions to the datasets\n", - "\n", - "We can now use the `assign_predictions()` method from the `Dataset` object to link existing predictions to any model. If no prediction values are passed, the method will compute predictions automatically:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds = vm.init_dataset(\n", - " input_id=\"train_dataset\",\n", - " dataset=train_df,\n", - " type=\"generic\",\n", - " target_column=demo_dataset.target_column,\n", - ")\n", - "\n", - "vm_train_ds.assign_predictions(model=vm_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can see below, the extra prediction column (`xgb_model_prediction`) for the model (`xgb_model`) has been added in the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(vm_train_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Using VM Model and Dataset objects as arguments in Custom tests\n", - "\n", - "We will now create a `@vm.test` wrapper that will allow you to create a reusable test. Note the following changes in the code below:\n", - "\n", - "- The function `confusion_matrix` takes two arguments `dataset` and `model`. This is a `VMDataset` and `VMModel` object respectively.\n", - " - `VMDataset` objects allow you to access the dataset's true (target) values by accessing the `.y` attribute.\n", - " - `VMDataset` objects allow you to access the predictions for a given model by accessing the `.y_pred()` method.\n", - "- The function docstring provides a description of what the test does. This will be displayed along with the result in this notebook as well as in the ValidMind Platform.\n", - "- The function body calculates the confusion matrix using the `sklearn.tests.confusion_matrix` function as we just did above.\n", - "- The function then returns the `ConfusionMatrixDisplay.figure_` object - this is important as the ValidMind Library expects the output of the custom test to be a plot or a table.\n", - "- The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ConfusionMatrix` (see the section below on how test IDs work in ValidMind and why this format is important)\n", - "\n", - "Similarly, you can use the functinality provided by `VMDataset` and `VMModel` objects. 
You can refer our documentation page for all the avalialble APIs [here](https://docs.validmind.ai/validmind/validmind.html#init_dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import metrics\n", - "import matplotlib.pyplot as plt\n", - "@vm.test(\"my_custom_tests.ConfusionMatrix\")\n", - "def confusion_matrix(dataset, model):\n", - " \"\"\"The confusion matrix is a table that is often used to describe the performance of a classification model on a set of data for which the true values are known.\n", - "\n", - " The confusion matrix is a 2x2 table that contains 4 values:\n", - "\n", - " - True Positive (TP): the number of correct positive predictions\n", - " - True Negative (TN): the number of correct negative predictions\n", - " - False Positive (FP): the number of incorrect positive predictions\n", - " - False Negative (FN): the number of incorrect negative predictions\n", - "\n", - " The confusion matrix can be used to assess the holistic performance of a classification model by showing the accuracy, precision, recall, and F1 score of the model on a single figure.\n", - " \"\"\"\n", - " # we can retrieve traget value from dataset which is y attribute\n", - " y_true = dataset.y\n", - " # The prediction value of a specific model using y_pred method \n", - " y_pred = dataset.y_pred(model=model)\n", - "\n", - " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", - "\n", - " cm_display = metrics.ConfusionMatrixDisplay(\n", - " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", - " )\n", - " cm_display.plot()\n", - " plt.close()\n", - "\n", - " return cm_display.figure_ # return the figure object itself" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we run test using two inputs; `dataset` and `model`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "result = run_test(\n", - " test_id = \"my_custom_tests.ConfusionMatrix\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " \"model\": vm_model,\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Log the test results\n", - "\n", - "You can log any test result to the ValidMind Platform with the `.log()` method of the result object. This will allow you to add the result to the documentation.\n", - "\n", - "You can now do the same for the confusion matrix results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Where to go from here\n", - "\n", - "In this notebook you have learned the end-to-end process to document a model with the ValidMind Library, running through some very common scenarios in a typical model development setting:\n", - "\n", - "- Running out-of-the-box tests\n", - "- Documenting your model by adding evidence to model documentation\n", - "- Extending the capabilities of the ValidMind Library by implementing custom tests\n", - "- Ensuring that the documentation is complete by running all tests in the documentation template\n", - "\n", - "As a next step, you can explore the following notebooks to get a deeper understanding on how the ValidMind Library allows you generate model documentation for any use case:\n", - "\n", - "\n", - "\n", - "### Use cases\n", - "\n", - "- [Document an application scorecard model](../code_samples/credit_risk/application_scorecard_full_suite.ipynb)\n", - "- [Linear regression documentation demo](../code_samples/regression/quickstart_regression_full_suite.ipynb)\n", - "- [LLM model documentation demo](../code_samples/nlp_and_llm/foundation_models_integration_demo.ipynb)\n", - "\n", - "\n", - "\n", - "### More how-to guides and code samples\n", - "\n", - "- [Explore available tests in detail](../how_to/explore_tests.ipynb)\n", - "- [In-depth guide for implementing custom tests](../code_samples/custom_tests/implement_custom_tests.ipynb)\n", - "- [In-depth guide to external test providers](../code_samples/custom_tests/integrate_external_test_providers.ipynb)\n", - "- [Configuring dataset features](../how_to/configure_dataset_features.ipynb)\n", - "- [Introduction to unit and composite tests](../how_to/run_unit_metrics.ipynb)\n", - "\n", - "\n", - "\n", - "### Discover more learning resources\n", - "\n", - "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", - "\n", - "- [Code samples](https://github.com/validmind/validmind-library/tree/main/notebooks/code_samples)\n", - "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-340a990e20194848af0efb0c965e219a", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/site/notebooks/images/composite-metric-in-template-preview.png b/site/notebooks/images/composite-metric-in-template-preview.png deleted file mode 100644 index e948fb901b..0000000000 Binary files a/site/notebooks/images/composite-metric-in-template-preview.png and /dev/null differ diff --git a/site/notebooks/images/high-pearson-correlation-block.png b/site/notebooks/images/high-pearson-correlation-block.png deleted file mode 100644 index dbe44392d1..0000000000 Binary files a/site/notebooks/images/high-pearson-correlation-block.png and /dev/null differ diff --git a/site/notebooks/images/insert-test-driven-block-correlations.png b/site/notebooks/images/insert-test-driven-block-correlations.png deleted file mode 100644 index 7ebb804881..0000000000 Binary files a/site/notebooks/images/insert-test-driven-block-correlations.png and /dev/null differ diff --git a/site/notebooks/images/insert-test-driven-block-custom-class-imbalance.jpg b/site/notebooks/images/insert-test-driven-block-custom-class-imbalance.jpg deleted file mode 100644 index 6a8fe31e87..0000000000 Binary files a/site/notebooks/images/insert-test-driven-block-custom-class-imbalance.jpg and /dev/null differ diff --git a/site/notebooks/images/insert-test-driven-block-custom-confusion-matrix.png b/site/notebooks/images/insert-test-driven-block-custom-confusion-matrix.png deleted file mode 100644 index 770354af86..0000000000 Binary files a/site/notebooks/images/insert-test-driven-block-custom-confusion-matrix.png and /dev/null differ diff --git a/site/notebooks/images/my_tests_directory.png b/site/notebooks/images/my_tests_directory.png deleted file mode 100644 index 47baffe80e..0000000000 Binary files a/site/notebooks/images/my_tests_directory.png and /dev/null differ diff --git a/site/notebooks/images/selecting-composite-metric.png b/site/notebooks/images/selecting-composite-metric.png deleted file mode 100644 index 3c741b8275..0000000000 Binary files a/site/notebooks/images/selecting-composite-metric.png and /dev/null differ diff --git a/site/notebooks/images/selecting-high-pearson-correlation-test.png b/site/notebooks/images/selecting-high-pearson-correlation-test.png deleted file mode 100644 index 78a4cfd98a..0000000000 Binary files a/site/notebooks/images/selecting-high-pearson-correlation-test.png and /dev/null differ diff --git a/site/notebooks/quickstart/quickstart_model_documentation.ipynb b/site/notebooks/quickstart/quickstart_model_documentation.ipynb index e88d9a8036..0128601870 100644 --- a/site/notebooks/quickstart/quickstart_model_documentation.ipynb +++ b/site/notebooks/quickstart/quickstart_model_documentation.ipynb @@ -152,7 +152,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom 
metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -226,8 +226,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/quickstart/quickstart_model_validation.ipynb b/site/notebooks/quickstart/quickstart_model_validation.ipynb index ffa1bc42ae..4621eb0d42 100644 --- a/site/notebooks/quickstart/quickstart_model_validation.ipynb +++ b/site/notebooks/quickstart/quickstart_model_validation.ipynb @@ -159,7 +159,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -195,8 +195,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down — don’t worry, we’ll adjust these permissions next for validation.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/tutorials/model_development/1-set_up_validmind.ipynb b/site/notebooks/tutorials/model_development/1-set_up_validmind.ipynb index f85b592d8a..61ee21e2b5 100644 --- a/site/notebooks/tutorials/model_development/1-set_up_validmind.ipynb +++ b/site/notebooks/tutorials/model_development/1-set_up_validmind.ipynb @@ -139,7 +139,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -215,8 +215,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/tutorials/model_development/2-start_development_process.ipynb b/site/notebooks/tutorials/model_development/2-start_development_process.ipynb index 1d62e9c085..23442b3d30 100644 --- a/site/notebooks/tutorials/model_development/2-start_development_process.ipynb +++ b/site/notebooks/tutorials/model_development/2-start_development_process.ipynb @@ -221,7 +221,7 @@ "source": [ "
Want to learn more about navigating ValidMind tests?\n", "

\n", - "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests" ] }, { diff --git a/site/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb b/site/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb index 49df9a5b50..9a3cbe7ef7 100644 --- a/site/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb +++ b/site/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb @@ -13,7 +13,7 @@ "- The function can be as simple or as complex as you need it to be — it can use external libraries, make API calls, or do anything else that you can do in Python.\n", "- The only requirement is that the function signature and return values can be \"understood\" and handled by the ValidMind Library. As such, custom tests offer added flexibility by extending the default tests provided by ValidMind, enabling you to document any type of model or use case.\n", "\n", - "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb) notebook.\n", + "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../how_to/tests/custom_tests/implement_custom_tests.ipynb) notebook.\n", "\n", "
Learn by doing\n", "

\n", @@ -817,7 +817,7 @@ "\n", "
Want to learn more about test providers?\n", "

\n", - "An extended introduction to test providers can be found in: Integrate external test providers
" + "An extended introduction to test providers can be found in: Integrate external test providers
" ] }, { diff --git a/site/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb b/site/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb index 0af95d90a7..04c745225e 100644 --- a/site/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb +++ b/site/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb @@ -930,9 +930,9 @@ "\n", "#### Use cases\n", "\n", - "- [Document an application scorecard model](../../code_samples/credit_risk/application_scorecard_full_suite.ipynb)\n", - "- [Linear regression documentation demo](../../code_samples/regression/quickstart_regression_full_suite.ipynb)\n", - "- [LLM model documentation demo](../../code_samples/nlp_and_llm/foundation_models_integration_demo.ipynb)" + "- [Document an application scorecard model](../../use_cases/credit_risk/application_scorecard_full_suite.ipynb)\n", + "- [Linear regression documentation demo](../../use_cases/regression/quickstart_regression_full_suite.ipynb)\n", + "- [LLM model documentation demo](../../use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb)" ] }, { @@ -943,12 +943,12 @@ "\n", "#### More how-to guides and code samples\n", "\n", - "- [Explore available tests in detail](../../how_to/explore_tests.ipynb)\n", - "- [In-depth guide on running dataset based tests](../../how_to/run_tests/1_run_dataset_based_tests.ipynb)\n", - "- [In-depth guide for implementing custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb)\n", - "- [In-depth guide to external test providers](../../code_samples/custom_tests/integrate_external_test_providers.ipynb)\n", - "- [Configuring dataset features](../../how_to/configure_dataset_features.ipynb)\n", - "- [Introduction to unit and composite metrics](../../how_to/run_unit_metrics.ipynb)" + "- [Explore available tests in detail](../../how_to/tests/explore_tests/explore_tests.ipynb)\n", + "- [In-depth guide on running dataset based tests](../../how_to/tests/run_tests/1_run_dataset_based_tests.ipynb)\n", + "- [In-depth guide for implementing custom tests](../../how_to/tests/custom_tests/implement_custom_tests.ipynb)\n", + "- [In-depth guide to external test providers](../../how_to/tests/custom_tests/integrate_external_test_providers.ipynb)\n", + "- [Configuring dataset features](../../how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb)\n", + "- [Introduction to unit and composite metrics](../../how_to/metrics/run_unit_metrics.ipynb)" ] }, { @@ -961,7 +961,7 @@ "\n", "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", "\n", - "- [Code samples](https://github.com/validmind/validmind-library/tree/main/notebooks/code_samples)\n", + "- [Use cases](https://github.com/validmind/validmind-library/tree/main/notebooks/use_cases)\n", "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)" ] }, diff --git a/site/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb b/site/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb index b9154b80c2..6a2e9e128a 100644 --- a/site/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb +++ b/site/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb @@ -141,7 +141,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - 
**dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -177,8 +177,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down — don’t worry, we’ll adjust these permissions next for validation.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/tutorials/model_validation/2-start_validation_process.ipynb b/site/notebooks/tutorials/model_validation/2-start_validation_process.ipynb index 48e4dbff66..77383ce0f2 100644 --- a/site/notebooks/tutorials/model_validation/2-start_validation_process.ipynb +++ b/site/notebooks/tutorials/model_validation/2-start_validation_process.ipynb @@ -237,7 +237,7 @@ "source": [ "
Want to learn more about navigating ValidMind tests?\n", "

\n", - "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests" ] }, { diff --git a/site/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb b/site/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb index fe4e221c90..6103fa2d41 100644 --- a/site/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb +++ b/site/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb @@ -13,7 +13,7 @@ "- The function can be as simple or as complex as you need it to be — it can use external libraries, make API calls, or do anything else that you can do in Python.\n", "- The only requirement is that the function signature and return values can be \"understood\" and handled by the ValidMind Library. As such, custom tests offer added flexibility by extending the default tests provided by ValidMind, enabling you to document any type of model or use case.\n", "\n", - "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb) notebook.\n", + "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../how_to/tests/custom_tests/implement_custom_tests.ipynb) notebook.\n", "\n", "
Learn by doing\n", "

\n", @@ -480,7 +480,7 @@ "\n", "
Want to learn more about custom tests?\n", "

\n", - "Refer to our in-depth introduction to custom tests: Implement custom tests
" + "Refer to our in-depth introduction to custom tests: Implement custom tests
" ] }, { @@ -856,7 +856,7 @@ "\n", "
Want to learn more about test providers?\n", "

\n", - "An extended introduction to test providers can be found in: Integrate external test providers
" + "An extended introduction to test providers can be found in: Integrate external test providers" ] }, { @@ -1176,10 +1176,10 @@ "\n", "#### More how-to guides and code samples\n", "\n", - "- [Explore available tests in detail](../../how_to/explore_tests.ipynb)\n", - "- [In-depth guide on running dataset based tests](../../how_to/run_tests/1_run_dataset_based_tests.ipynb)\n", - "- [In-depth guide for running comparison tests](../../how_to/run_tests/2_run_comparison_tests.ipynb)\n", - "- [In-depth guide for implementing custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb)" + "- [Explore available tests in detail](../../how_to/tests/explore_tests/explore_tests.ipynb)\n", + "- [In-depth guide on running dataset based tests](../../how_to/tests/run_tests/1_run_dataset_based_tests.ipynb)\n", + "- [In-depth guide for running comparison tests](../../how_to/tests/run_tests/2_run_comparison_tests.ipynb)\n", + "- [In-depth guide for implementing custom tests](../../how_to/tests/custom_tests/implement_custom_tests.ipynb)" ] }, { @@ -1192,7 +1192,7 @@ "\n", "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", "\n", - "- [Code samples](https://github.com/validmind/validmind-library/tree/main/notebooks/code_samples)\n", + "- [Use cases](https://github.com/validmind/validmind-library/tree/main/notebooks/use_cases)\n", "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)\n", "\n", "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." diff --git a/site/notebooks/use_cases/agents/agentic_ai_template.yaml b/site/notebooks/use_cases/agents/agentic_ai_template.yaml new file mode 100644 index 0000000000..06ef71c9c2 --- /dev/null +++ b/site/notebooks/use_cases/agents/agentic_ai_template.yaml @@ -0,0 +1,343 @@ +# Copyright © 2023-2026 ValidMind Inc. All rights reserved. +# Refer to the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +- id: executive_summary + title: Executive Summary + guidelines: + - Provide a high-level overview of the agentic AI system, including its + purpose, scope, and intended use cases. + - Summarize the key features that make the system agentic, such as autonomy, + reasoning, memory, adaptability, and goal-directed behavior. + - Highlight the strategic benefits for the organization, such as efficiency, + scalability, cost-effectiveness, and decision-making support. + - Outline the system’s testing and validation strategy at a glance, + emphasizing safety, reliability, and regulatory compliance. + - Identify major risks, limitations, and safeguards, giving stakeholders a + concise understanding of governance and monitoring plans. + - Present the deployment vision, including expected stakeholders, + operational environments, and integration with existing workflows. + index_only: true +- id: conceptual_soundness + title: Conceptual Soundness + index_only: true + sections: + - id: model_overview + title: Model Overview + guidelines: + - Provide a concise explanation of the system’s purpose, including how + the agentic AI framework enables autonomous decision-making, + reasoning, and action-taking. + - Describe the high-level design of the agent(s), their core objectives, + and how they interact with their environment and users. 
+ - Explain the conceptual differences between this agentic system and + traditional AI/ML models, focusing on autonomy, adaptability, and + emergent behavior. + - Highlight the role of agency, memory, feedback loops, and + goal-directedness in the system’s operation. + - Summarize the overall vision for how the system is intended to be + applied in real-world contexts, along with high-level testing goals. + parent_section: conceptual_soundness + - id: model_selection + title: Model Selection + guidelines: + - Describe the agentic AI paradigm, reasoning algorithms, or frameworks + chosen (e.g., reinforcement learning, planning, LLM-based + orchestration) and why they are suitable for the use case. + - Explain how the selected approach supports autonomy, adaptability, and + safe delegation of decision-making to the agent. + - Compare alternative paradigms (e.g., rule-based agents, purely + supervised ML models) and clarify why they were less appropriate. + - Discuss any hybrid approaches (e.g., combining symbolic reasoning with + generative models) and the rationale for customization. + - Identify potential risks and trade-offs of the chosen approach, + including known failure modes, and describe how these will be tested + and validated. + parent_section: conceptual_soundness + contents: + - content_id: model_selection + content_type: text + - id: purpose_and_scope + title: Purpose and Scope + guidelines: + - Clearly define the primary goals of the agentic AI system, including + decision-making domains and problem boundaries. + - Specify intended users, stakeholders, and environments where the agent + will operate. + - Identify the scope of autonomy granted to the agent (e.g., advisory + role, execution authority, or fully autonomous operation). + - Clarify the operational limits and scenarios where human oversight, + intervention, or escalation is required. + - Define measurable testing objectives that validate the agent’s + performance within its declared scope. + parent_section: conceptual_soundness + - id: architecture_at_glance + title: Architecture at Glance + guidelines: + - Provide a high-level diagram or description of the system + architecture, including agents, memory, reasoning modules, and + communication channels. + - Explain how the architecture supports perception, reasoning, planning, + and action loops. + - Highlight integration points with external systems, APIs, or data + sources. + - Describe the flow of information and control, showing how decisions + are formed, validated, and executed. + - Summarize testing hooks or checkpoints across components to enable + unit, integration, and system-level evaluation. + parent_section: conceptual_soundness + - id: assumptions_and_limitations + title: Assumptions and Limitations + guidelines: + - List the explicit assumptions about the environment, data, and user + behavior that underpin the system’s design. + - Identify constraints in agent reasoning, knowledge scope, or autonomy + that may affect performance. + - Discuss limitations in generalizability across contexts, domains, or + environments. + - Describe how uncertainty, incomplete information, or conflicting + objectives are handled. + - Explain how assumptions and limitations are validated through stress + tests, adversarial scenarios, and edge-case evaluations. 
+ parent_section: conceptual_soundness + - id: regulatory_requirements + title: Regulatory Requirements + guidelines: + - Identify relevant laws, regulations, and standards applicable to + autonomous decision-making systems in the financial or operational + domain. + - Explain how the system addresses compliance needs such as + auditability, explainability, fairness, and accountability. + - Clarify how human oversight and control are integrated to meet + regulatory expectations for autonomous AI. + - Highlight any specific documentation, logging, or reporting features + built into the system for compliance purposes. + - Describe testing procedures to validate regulatory compliance, + including audit trail verification and explainability checks. + parent_section: conceptual_soundness +- id: data_preparation + title: Data Evaluation + index_only: true + sections: + - id: data_description + title: Data Description + guidelines: + - Provide an overview of data sources used by the agent(s), including + structured, unstructured, streaming, or interaction-derived data. + - Describe how contextual, environmental, or feedback data is + incorporated into the agent’s reasoning processes. + - Explain how memory structures (short-term, long-term, episodic) depend + on or interact with data inputs. + - Detail preprocessing or feature engineering tailored to enable + reasoning, planning, or adaptation. + - Include validation procedures to confirm data relevance, + representativeness, and adequacy for agent training and testing. + parent_section: data_preparation + - id: data_quality + title: Data Quality + guidelines: + - Define quality requirements for agent inputs, including accuracy, + timeliness, and consistency of real-world data streams. + - Describe methods for detecting and handling incomplete, noisy, or + adversarial data. + - Explain quality control for interaction data (e.g., user prompts, + feedback) that may shape agent behavior. + - Highlight processes for maintaining integrity of memory stores and + preventing drift due to poor input quality. + - Include testing protocols for validating data pipelines, + stress-testing with edge cases, and detecting bias leakage. + parent_section: data_preparation + contents: [] +- id: model_evaluation + title: Model Evaluation + index_only: true + sections: + - id: model_description + title: Model Description + guidelines: + - Provide a clear description of the agent’s architecture, reasoning + cycle, and interaction model. + - Explain the roles of planning, memory, and feedback in enabling + autonomy and adaptability. + - Detail how subcomponents (e.g., LLMs, planners, evaluators) integrate + to achieve end-to-end functionality. + - Clarify how emergent behaviors are monitored and managed. + - Specify test coverage for each component, including unit tests, + integration tests, and system-level tests. + parent_section: model_evaluation + - id: evaluation_methodology + title: Evaluation Methodology + guidelines: + - Describe the evaluation framework for testing autonomy, adaptability, + and goal alignment. + - Specify metrics for reasoning quality, task success, efficiency, and + safety. + - Explain simulation, sandboxing, or staged deployment approaches used + for testing. + - Include stress-testing for unexpected inputs, adversarial prompts, or + dynamic environments. + - Define reproducibility and benchmarking protocols to validate results + consistently across test cycles. 
+ parent_section: model_evaluation + - id: prompt_evaluation + title: Prompt Evaluation + guidelines: + - Describe how the system’s responses to prompts are evaluated for + relevance, accuracy, and safety. + - Explain methods for detecting prompt injection, manipulation, or + adversarial use. + - Detail how evaluation ensures robustness against ambiguous, + conflicting, or incomplete instructions. + - Clarify criteria for determining when escalation to human oversight is + required. + - Define testing strategies for prompt templates, prompt chaining, and + stress scenarios. + contents: + - content_type: test + content_id: validmind.prompt_validation.Clarity + - content_type: test + content_id: validmind.prompt_validation.Conciseness + - content_type: test + content_id: validmind.prompt_validation.Delimitation + - content_type: test + content_id: validmind.prompt_validation.NegativeInstruction + - content_type: test + content_id: validmind.prompt_validation.Specificity + parent_section: model_evaluation + - id: agent_evaluation + title: Agent Evaluation + guidelines: + - Provide methods for assessing the agent’s ability to reason, plan, and + act autonomously. + - Define success metrics such as goal completion rate, adaptability to + change, and alignment with human intent. + - Explain how unintended or emergent behaviors are identified and + evaluated. + - Include testing for multi-agent interactions, collaboration, or + conflict resolution. + - Describe adversarial and edge-case testing to validate resilience of + autonomous decision-making. + contents: + - content_type: test + content_id: my_custom_tests.banking_accuracy_test + - content_type: test + content_id: my_custom_tests.BankingToolCallAccuracy + parent_section: model_evaluation + - id: output_quality + title: Output Quality + guidelines: + - Define quality standards for agent outputs (e.g., recommendations, + actions, reports). + - Evaluate outputs for consistency, accuracy, and contextual + appropriateness. + - Assess outputs for fairness, non-discrimination, and alignment with + ethical principles. + - Include processes for handling uncertainty or probabilistic reasoning + in outputs. + - Develop automated test suites to benchmark output quality against gold + standards or domain experts. + contents: + - content_type: test + content_id: validmind.model_validation.ragas.Faithfulness + - content_type: test + content_id: validmind.model_validation.ragas.ResponseRelevancy + - content_type: test + content_id: validmind.model_validation.ragas.ContextRecall + parent_section: model_evaluation + - id: Safety + title: Safety + guidelines: + - Describe built-in safety mechanisms to prevent harmful or unintended + actions by the agent. + - Explain escalation protocols for high-risk decisions requiring human + oversight. + - Detail adversarial robustness testing and red-teaming efforts to + uncover vulnerabilities. + - Clarify methods for ensuring alignment with ethical, legal, and + organizational safety standards. + - Include continuous validation tests for safety boundaries under + evolving data and environment conditions. 
+ contents: + - content_type: test + content_id: validmind.model_validation.ragas.AspectCritic + - content_type: test + content_id: validmind.prompt_validation.Bias + - content_type: test + content_id: validmind.data_validation.nlp.Toxicity + parent_section: model_evaluation + - id: reliability_resilience_and_degraded_modes + title: Reliability, Resilience and Degraded Modes + guidelines: + - Explain strategies to ensure continuity of service during system or + environment disruptions. + - Describe fallback behaviors, degraded modes, or safe defaults when + full autonomy is not possible. + - Detail resilience mechanisms for handling network, data, or + computational failures. + - Provide monitoring methods for detecting and recovering from system + instability or drift. + - Define test scenarios simulating degraded conditions to validate + graceful failure and recovery. + parent_section: model_evaluation + - id: c46a7162-5fcd-4d2f-87e2-084afae70ee9 + title: Actor specific Results + parent_section: model_evaluation + contents: [] + sections: + - id: e78c8564-5af1-4ecc-b200-f131a629a01c + title: Credit Risk Analyzer + parent_section: c46a7162-5fcd-4d2f-87e2-084afae70ee9 + contents: [] + - id: df36a0c3-be44-4e16-a59a-cb635eac3ff3 + title: Customer Account Manager + parent_section: c46a7162-5fcd-4d2f-87e2-084afae70ee9 + contents: [] + - id: 67d25cc5-2569-4727-aae1-6c5b2f84e238 + title: Fraud Detection System + parent_section: c46a7162-5fcd-4d2f-87e2-084afae70ee9 + contents: [] + - id: cost_and_performance_management + title: Cost and Performance Management + guidelines: + - Provide metrics for computational efficiency, resource utilization, + and scalability of the system. + - Explain trade-offs between autonomy, performance, and resource + consumption. + - Detail monitoring of infrastructure costs, particularly in multi-agent + or large-scale deployments. + - Describe optimization strategies for balancing responsiveness with + efficiency. + - Include load testing, latency measurement, and profiling to validate + scalability and cost-effectiveness. + parent_section: model_evaluation +- id: observability_and_monitoring + title: Observability and Monitoring + index_only: true + sections: + - id: monitoring_plan + title: Monitoring Plan + guidelines: + - Describe monitoring practices for reasoning quality, autonomy + boundaries, and safety compliance. + - Define triggers or alerts for deviations in agent behavior, output + quality, or ethical alignment. + - Explain feedback mechanisms for continuous improvement, retraining, or + realignment. + - Detail governance processes overseeing the monitoring, including human + review cycles. + - Specify testing protocols for validating monitoring tools, anomaly + detection, and alert reliability. + parent_section: observability_and_monitoring + - id: remediation_plan + title: Remediation Plan + guidelines: + - Provide steps for addressing performance degradation, misalignment, or + unsafe behaviors. + - Define escalation protocols and roles for intervention when agent + behavior breaches acceptable limits. + - Describe rollback strategies to revert to prior safe versions or modes. + - Explain retraining or recalibration processes when monitoring + identifies issues. + - Include regular scenario-based testing to validate the effectiveness + of remediation and recovery procedures. 
+ parent_section: observability_and_monitoring diff --git a/site/notebooks/code_samples/agents/banking_test_dataset.py b/site/notebooks/use_cases/agents/banking_test_dataset.py similarity index 53% rename from site/notebooks/code_samples/agents/banking_test_dataset.py rename to site/notebooks/use_cases/agents/banking_test_dataset.py index bd2793169a..895b1e97f3 100644 --- a/site/notebooks/code_samples/agents/banking_test_dataset.py +++ b/site/notebooks/use_cases/agents/banking_test_dataset.py @@ -12,14 +12,6 @@ "session_id": str(uuid.uuid4()), "category": "credit_risk" }, - { - "input": "Evaluate credit risk for a business loan of $250,000 with monthly revenue of $85,000 and existing debt of $45,000 and credit score of 650", - "expected_tools": ["credit_risk_analyzer"], - "possible_outputs": ["MEDIUM RISK", "HIGH RISK", "business loan", "debt service coverage ratio", "1.8", "annual revenue", "$1,020,000", "risk score", "650"], - "expected_output": "MEDIUM RISK", # Example, adjust as needed - "session_id": str(uuid.uuid4()), - "category": "credit_risk" - }, { "input": "Check account balance for checking account 12345", "expected_tools": ["customer_account_manager"], @@ -45,29 +37,5 @@ "expected_output": "High-Yield Savings Account (2.5% APY)", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "account_management" - }, - { - "input": "Investigate suspicious transactions totaling $75,000 across multiple accounts in the last week", - "expected_tools": ["fraud_detection_system"], - "possible_outputs": ["Require additional verification", "Implement 24-hour delay for verification"], - "expected_output": "Require additional verification", # Example, adjust as needed - "session_id": str(uuid.uuid4()), - "category": "fraud_detection" - }, - { - "input": "Assess credit risk for a $1,000,000 commercial real estate loan with $500,000 annual business income", - "expected_tools": ["credit_risk_analyzer"], - "possible_outputs": ["HIGH RISK", "VERY HIGH RISK", "loan-to-value", "66.7%", "debt service coverage", "2.0"], - "expected_output": "HIGH RISK", # Example, adjust as needed - "session_id": str(uuid.uuid4()), - "category": "credit_risk" - }, - { - "input": "Update customer contact information and address for account holder 22334", - "expected_tools": ["customer_account_manager"], - "possible_outputs": ["not found in system", "Customer ID 22334 not found in system.", "not found"], - "expected_output": "Customer ID 22334 not found in system.", # Example, adjust as needed - "session_id": str(uuid.uuid4()), - "category": "account_management" } ]) diff --git a/site/notebooks/code_samples/agents/banking_tools.py b/site/notebooks/use_cases/agents/banking_tools.py similarity index 99% rename from site/notebooks/code_samples/agents/banking_tools.py rename to site/notebooks/use_cases/agents/banking_tools.py index eb9bb00767..a3f08c6ebf 100644 --- a/site/notebooks/code_samples/agents/banking_tools.py +++ b/site/notebooks/use_cases/agents/banking_tools.py @@ -279,8 +279,8 @@ def _handle_recommend_product(customer): def _handle_get_info(customer, customer_id): """Handle get info action.""" - credit_tier = ('Excellent' if customer['credit_score'] >= 750 else - 'Good' if customer['credit_score'] >= 700 else + credit_tier = ('Excellent' if customer['credit_score'] >= 750 else + 'Good' if customer['credit_score'] >= 700 else 'Fair' if customer['credit_score'] >= 650 else 'Poor') return f"""CUSTOMER ACCOUNT INFORMATION diff --git a/site/notebooks/use_cases/agents/document_agentic_ai.ipynb 
b/site/notebooks/use_cases/agents/document_agentic_ai.ipynb new file mode 100644 index 0000000000..4b1a6fb7a4 --- /dev/null +++ b/site/notebooks/use_cases/agents/document_agentic_ai.ipynb @@ -0,0 +1,2197 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e7277c38", + "metadata": {}, + "source": [ + "# Document an agentic AI system\n", + "\n", + "Build and document an agentic AI system with the ValidMind Library. Construct a LangGraph-based banking agent, assign AI evaluation metric scores to your agent, and run accuracy, RAGAS, and safety tests, then log those test results to the ValidMind Platform.\n", + "\n", + "An _AI agent_ is an autonomous system that interprets inputs, selects from available tools or actions, and executes multi-step behaviors to achieve defined goals. In this notebook, the agent acts as a banking assistant that analyzes user requests and automatically selects and invokes the appropriate specialized banking tool to deliver accurate, compliant, and actionable responses.\n", + "\n", + "- This agent enables financial institutions to automate complex banking workflows where different customer requests require different specialized tools and knowledge bases.\n", + "- Effective validation of agentic AI systems reduces the risks of agents misinterpreting inputs, failing to extract required parameters, or producing incorrect assessments or actions — such as selecting the wrong tool.\n", + "\n", + "
For the LLM components in this notebook to function properly, you'll need access to OpenAI.\n", + "

\n", + "Before you continue, ensure that a valid OPENAI_API_KEY is set in your .env file.
" + ] + }, + { + "cell_type": "markdown", + "id": "a47dd942", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Install the ValidMind Library](#toc2_1__) \n", + " - [Initialize the ValidMind Library](#toc2_2__) \n", + " - [Register sample model](#toc2_2_1__) \n", + " - [Apply documentation template](#toc2_2_2__) \n", + " - [Get your code snippet](#toc2_2_3__) \n", + " - [Preview the documentation template](#toc2_2_4__) \n", + " - [Verify OpenAI API access](#toc2_3__) \n", + " - [Initialize the Python environment](#toc2_4__) \n", + "- [Building the LangGraph agent](#toc3__) \n", + " - [Test available banking tools](#toc3_1__) \n", + " - [Create LangGraph banking agent](#toc3_2__) \n", + " - [Define system prompt](#toc3_2_1__) \n", + " - [Initialize the LLM](#toc3_2_2__) \n", + " - [Define agent state structure](#toc3_2_3__) \n", + " - [Create agent workflow function](#toc3_2_4__) \n", + " - [Instantiate the banking agent](#toc3_2_5__) \n", + " - [Integrate agent with ValidMind](#toc3_3__) \n", + " - [Import ValidMind components](#toc3_3_1__) \n", + " - [Create agent wrapper function](#toc3_3_2__) \n", + " - [Initialize the ValidMind model object](#toc3_3_3__) \n", + " - [Store the agent reference](#toc3_3_4__) \n", + " - [Verify integration](#toc3_3_5__) \n", + " - [Validate the system prompt](#toc3_4__) \n", + "- [Initialize the ValidMind datasets](#toc4__) \n", + " - [Assign predictions](#toc4_1__) \n", + "- [Running accuracy tests](#toc5__) \n", + " - [Response accuracy test](#toc5_1__) \n", + " - [Tool selection accuracy test](#toc5_2__) \n", + "- [Assigning AI evaluation metric scores](#toc6__) \n", + " - [Identify relevant DeepEval scorers](#toc6_1__) \n", + " - [Assign reasoning scores](#toc6_2__) \n", + " - [Plan quality score](#toc6_2_1__) \n", + " - [Plan adherence score](#toc6_2_2__) \n", + " - [Assign action scores](#toc6_3__) \n", + " - [Tool correctness score](#toc6_3_1__) \n", + " - [Argument correctness score](#toc6_3_2__) \n", + " - [Assign execution scores](#toc6_4__) \n", + " - [Task completion score](#toc6_4_1__) \n", + "- [Running RAGAS tests](#toc7__) \n", + " - [Identify relevant RAGAS tests](#toc7_1__) \n", + " - [Faithfulness](#toc7_1_1__) \n", + " - [Response Relevancy](#toc7_1_2__) \n", + " - [Context Recall](#toc7_1_3__) \n", + "- [Running safety tests](#toc8__) \n", + " - [AspectCritic](#toc8_1_1__) \n", + " - [Bias](#toc8_1_2__) \n", + "- [Next steps](#toc9__) \n", + " - [Work with your model documentation](#toc9_1__) \n", + " - [Customize the banking agent for your use case](#toc9_2__) \n", + " - [Discover more learning resources](#toc9_3__) \n", + "- [Upgrade ValidMind](#toc10__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "ecaad35f", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models. \n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators." + ] + }, + { + "cell_type": "markdown", + "id": "6ff1f9ef", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "d7ad8d8c", + "metadata": {}, + "source": [ + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "id": "323caa59", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", + "\n", + "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. 
Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." + ] + }, + { + "cell_type": "markdown", + "id": "ddba5169", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "b53da99c", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "Let's begin by installing the ValidMind Library with large language model (LLM) support:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1982a118", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q \"validmind[llm]\" \"langgraph==0.3.21\"" + ] + }, + { + "cell_type": "markdown", + "id": "dc9dea3a", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "id": "5848461e", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook.\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "id": "97d0b04b", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Agentic AI`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "id": "b279d5fa", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "3606cb8c", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6ccbefc", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2ed79cf0", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dffdaa6f", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "b5c5ba68", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Verify OpenAI API access\n", + "\n", + "Verify that a valid `OPENAI_API_KEY` is set in your `.env` file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22cc39cb", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables if using .env file\n", + "try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv()\n", + "except ImportError:\n", + " print(\"dotenv not installed. Make sure OPENAI_API_KEY is set in your environment.\")" + ] + }, + { + "cell_type": "markdown", + "id": "e4a9d3a9", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the Python environment\n", + "\n", + "Let's import all the necessary libraries to prepare for building our banking LangGraph agentic system:\n", + "\n", + "- **Standard libraries** for data handling and environment management.\n", + "- **pandas**, a Python library for data manipulation and analytics, as an alias. We'll also configure pandas to show all columns and all rows at full width for easier debugging and inspection.\n", + "- **LangChain** components for LLM integration and tool management.\n", + "- **LangGraph** for building stateful, multi-step agent workflows.\n", + "- **Banking tools** for specialized financial services as defined in [banking_tools.py](banking_tools.py)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2058d1ac", + "metadata": {}, + "outputs": [], + "source": [ + "# STANDARD LIBRARY IMPORTS\n", + "\n", + "# TypedDict: Defines type-safe dictionaries for the agent's state structure\n", + "# Annotated: Adds metadata to type hints\n", + "# Sequence: Type hint for sequences used in the agent\n", + "from typing import TypedDict, Annotated, Sequence\n", + "\n", + "# THIRD PARTY IMPORTS\n", + "\n", + "import pandas as pd\n", + "# Configure pandas to show all columns and all rows at full width\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.width', None)\n", + "pd.set_option('display.max_rows', None)\n", + "\n", + "# BaseMessage: Represents a base message in the LangChain message system\n", + "# HumanMessage: Represents a human message in the LangChain message system\n", + "# SystemMessage: Represents a system message in the LangChain message system\n", + "from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage\n", + "\n", + "# ChatOpenAI: Represents an OpenAI chat model in the LangChain library\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "# MemorySaver: Represents a checkpoint for saving and restoring agent state\n", + "from langgraph.checkpoint.memory import MemorySaver\n", + "\n", + "# StateGraph: Represents a stateful graph in the LangGraph library\n", + "# END: Represents the end of a graph\n", + "# START: Represents the start of a graph\n", + "from langgraph.graph import StateGraph, END, START\n", + "\n", + "# add_messages: Adds messages to the state\n", + "from langgraph.graph.message import add_messages\n", + "\n", + "# ToolNode: Represents a tool node in the LangGraph library\n", + "from langgraph.prebuilt import ToolNode\n", + "\n", + "# LOCAL IMPORTS FROM banking_tools.py\n", + "\n", + "from banking_tools import AVAILABLE_TOOLS" + ] + }, + { + "cell_type": "markdown", + "id": "e109d075", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Building the LangGraph agent" + ] + }, + { + "cell_type": "markdown", + "id": "15040411", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Test available banking tools\n", + "\n", + "We'll use the demo banking tools defined in `banking_tools.py` that provide use cases of financial services:\n", + "\n", + "- **Credit Risk Analyzer** - Loan applications and credit decisions\n", + "- **Customer Account Manager** - Account services and customer support\n", + "- **Fraud Detection System** - Security and fraud prevention" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e0a120c", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Available tools: {len(AVAILABLE_TOOLS)}\")\n", + "print(\"\\nTool Details:\")\n", + "for i, tool in enumerate(AVAILABLE_TOOLS, 1):\n", + " print(f\" - {tool.name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "04d6785a", + "metadata": {}, + "source": [ + "Let's test each banking tool individually to ensure they're working correctly before integrating them into our agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc0caff2", + "metadata": {}, + "outputs": [], + "source": [ + "# Test 1: Credit Risk Analyzer\n", + "print(\"TEST 1: Credit Risk Analyzer\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " # Access the underlying function using .func\n", + " credit_result = AVAILABLE_TOOLS[0].func(\n", + " customer_income=75000,\n", + " customer_debt=1200,\n", + " 
credit_score=720,\n", + " loan_amount=50000,\n", + " loan_type=\"personal\"\n", + " )\n", + " print(credit_result)\n", + " print(\"Credit Risk Analyzer test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Credit Risk Analyzer test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6b227db", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Test 2: Customer Account Manager\n", + "print(\"TEST 2: Customer Account Manager\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " # Test checking balance\n", + " account_result = AVAILABLE_TOOLS[1].func(\n", + " account_type=\"checking\",\n", + " customer_id=\"12345\",\n", + " action=\"check_balance\"\n", + " )\n", + " print(account_result)\n", + "\n", + " # Test getting account info\n", + " info_result = AVAILABLE_TOOLS[1].func(\n", + " account_type=\"all\",\n", + " customer_id=\"12345\", \n", + " action=\"get_info\"\n", + " )\n", + " print(info_result)\n", + " print(\"Customer Account Manager test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Customer Account Manager test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a983b30d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Test 3: Fraud Detection System\n", + "print(\"TEST 3: Fraud Detection System\")\n", + "print(\"-\" * 40)\n", + "try:\n", + " fraud_result = AVAILABLE_TOOLS[2].func(\n", + " transaction_id=\"TX123\",\n", + " customer_id=\"12345\",\n", + " transaction_amount=500.00,\n", + " transaction_type=\"withdrawal\",\n", + " location=\"Miami, FL\",\n", + " device_id=\"DEVICE_001\"\n", + " )\n", + " print(fraud_result)\n", + " print(\"Fraud Detection System test PASSED\")\n", + "except Exception as e:\n", + " print(f\"Fraud Detection System test FAILED: {e}\")\n", + "\n", + "print(\"\" + \"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "6bf04845", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Create LangGraph banking agent\n", + "\n", + "With our tools ready to go, we'll create our intelligent banking agent with LangGraph that automatically selects and uses the appropriate banking tool based on a user request." 
+ ] + }, + { + "cell_type": "markdown", + "id": "31df57f0", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Define system prompt\n", + "\n", + "We'll begin by defining our system prompt, which provides the LLM with context about its role as a banking assistant and guidance on when to use each available tool:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7971c427", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Enhanced banking system prompt with tool selection guidance\n", + "system_context = \"\"\"You are a professional banking AI assistant with access to specialized banking tools.\n", + " Analyze the user's banking request and directly use the most appropriate tools to help them.\n", + " \n", + " AVAILABLE BANKING TOOLS:\n", + " \n", + " credit_risk_analyzer - Analyze credit risk for loan applications and credit decisions\n", + " - Use for: loan applications, credit assessments, risk analysis, mortgage eligibility\n", + " - Examples: \"Analyze credit risk for $50k personal loan\", \"Assess mortgage eligibility for $300k home purchase\"\n", + " - Parameters: customer_income, customer_debt, credit_score, loan_amount, loan_type\n", + "\n", + " customer_account_manager - Manage customer accounts and provide banking services\n", + " - Use for: account information, transaction processing, product recommendations, customer service\n", + " - Examples: \"Check balance for checking account 12345\", \"Recommend products for customer with high balance\"\n", + " - Parameters: account_type, customer_id, action, amount, account_details\n", + "\n", + " fraud_detection_system - Analyze transactions for potential fraud and security risks\n", + " - Use for: transaction monitoring, fraud prevention, risk assessment, security alerts\n", + " - Examples: \"Analyze fraud risk for $500 ATM withdrawal in Miami\", \"Check security for $2000 online purchase\"\n", + " - Parameters: transaction_id, customer_id, transaction_amount, transaction_type, location, device_id\n", + "\n", + " BANKING INSTRUCTIONS:\n", + " - Analyze the user's banking request carefully and identify the primary need\n", + " - If they need credit analysis → use credit_risk_analyzer\n", + " - If they need financial calculations → use financial_calculator\n", + " - If they need account services → use customer_account_manager\n", + " - If they need security analysis → use fraud_detection_system\n", + " - Extract relevant parameters from the user's request\n", + " - Provide helpful, accurate banking responses based on tool outputs\n", + " - Always consider banking regulations, risk management, and best practices\n", + " - Be professional and thorough in your analysis\n", + "\n", + " Choose and use tools wisely to provide the most helpful banking assistance.\n", + " Describe the response in user friendly manner with details describing the tool output. 
\n", + " Provide the response in at least 500 words.\n", + " Generate a concise execution plan for the banking request.\n", + " \"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "406835c8", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Initialize the LLM\n", + "\n", + "Let's initialize the LLM that will power our banking agent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "866066e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the main LLM for banking responses\n", + "main_llm = ChatOpenAI(\n", + " model=\"gpt-5-mini\",\n", + " reasoning={\n", + " \"effort\": \"low\",\n", + " \"summary\": \"auto\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cce9685c", + "metadata": {}, + "source": [ + "Then bind the available banking tools to the LLM, enabling the model to automatically recognize and invoke each tool when appropriate based on request input and the system prompt we defined above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "906d8132", + "metadata": {}, + "outputs": [], + "source": [ + "# Bind all banking tools to the main LLM\n", + "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)" + ] + }, + { + "cell_type": "markdown", + "id": "2bad8799", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Define agent state structure\n", + "\n", + "The agent state defines the data structure that flows through the LangGraph workflow. It includes:\n", + "\n", + "- **messages** — The conversation history between the user and agent\n", + "- **user_input** — The current user request\n", + "- **session_id** — A unique identifier for the conversation session\n", + "- **context** — Additional context that can be passed between nodes\n", + "\n", + "Defining this state structure maintains the structure throughout the agent's execution and allows for multi-turn conversations with memory:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b926ddf", + "metadata": {}, + "outputs": [], + "source": [ + "# Banking Agent State Definition\n", + "class BankingAgentState(TypedDict):\n", + " messages: Annotated[Sequence[BaseMessage], add_messages]\n", + " user_input: str\n", + " session_id: str\n", + " context: dict" + ] + }, + { + "cell_type": "markdown", + "id": "47ce81b7", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Create agent workflow function\n", + "\n", + "We'll build the LangGraph agent workflow with two main components:\n", + "\n", + "1. **LLM node** — Processes user requests, applies the system prompt, and decides whether to use tools.\n", + "2. **Tools node** — Executes the selected banking tools when the LLM determines they're needed.\n", + "\n", + "The workflow begins with the LLM analyzing the request, then uses tools if needed — or ends if the response is complete, and finally returns to the LLM to generate the final response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c9bf585", + "metadata": {}, + "outputs": [], + "source": [ + "def create_banking_langgraph_agent():\n", + " \"\"\"Create a comprehensive LangGraph banking agent with intelligent tool selection.\"\"\"\n", + " def llm_node(state: BankingAgentState) -> BankingAgentState:\n", + " \"\"\"Main LLM node that processes banking requests and selects appropriate tools.\"\"\"\n", + " messages = state[\"messages\"]\n", + " # Add system context to messages\n", + " enhanced_messages = [SystemMessage(content=system_context)] + list(messages)\n", + " # Get LLM response with tool selection\n", + " response = llm_with_tools.invoke(enhanced_messages)\n", + " return {\n", + " **state,\n", + " \"messages\": messages + [response]\n", + " }\n", + " \n", + " def should_continue(state: BankingAgentState) -> str:\n", + " \"\"\"Decide whether to use tools or end the conversation.\"\"\"\n", + " last_message = state[\"messages\"][-1]\n", + " # Check if the LLM wants to use tools\n", + " if hasattr(last_message, 'tool_calls') and last_message.tool_calls:\n", + " return \"tools\"\n", + " return END\n", + " \n", + " # Create the banking state graph\n", + " workflow = StateGraph(BankingAgentState)\n", + " # Add nodes\n", + " workflow.add_node(\"llm\", llm_node)\n", + " workflow.add_node(\"tools\", ToolNode(AVAILABLE_TOOLS))\n", + " # Simplified entry point - go directly to LLM\n", + " workflow.add_edge(START, \"llm\")\n", + " # From LLM, decide whether to use tools or end\n", + " workflow.add_conditional_edges(\n", + " \"llm\",\n", + " should_continue,\n", + " {\"tools\": \"tools\", END: END}\n", + " )\n", + " # Tool execution flows back to LLM for final response\n", + " workflow.add_edge(\"tools\", \"llm\")\n", + " # Set up memory\n", + " memory = MemorySaver()\n", + " # Compile the graph\n", + " agent = workflow.compile(checkpointer=memory)\n", + " return agent" + ] + }, + { + "cell_type": "markdown", + "id": "3eb40287", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Instantiate the banking agent\n", + "\n", + "Now, we'll create an instance of the banking agent by calling the workflow creation function.\n", + "\n", + "This compiled agent is ready to process banking requests and will automatically select and use the appropriate tools based on user queries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "455b8ee4", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the banking intelligent agent\n", + "banking_agent = create_banking_langgraph_agent()\n", + "\n", + "print(\"Banking LangGraph Agent Created Successfully!\")\n", + "print(\"\\nFeatures:\")\n", + "print(\" - Intelligent banking tool selection\")\n", + "print(\" - Comprehensive banking system prompt\")\n", + "print(\" - Streamlined workflow: LLM → Tools → Response\")\n", + "print(\" - Automatic tool parameter extraction\")\n", + "print(\" - Professional banking assistance\")" + ] + }, + { + "cell_type": "markdown", + "id": "12691528", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Integrate agent with ValidMind\n", + "\n", + "To integrate our LangGraph banking agent with ValidMind, we need to create a wrapper function that ValidMind can use to invoke the agent and extract the necessary information for testing and documentation, allowing ValidMind to run validation tests on the agent's behavior, tool usage, and responses." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7b78509b", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Import ValidMind components\n", + "\n", + "We'll start with importing the necessary ValidMind components for integrating our agent:\n", + "\n", + "- `Prompt` from `validmind.models` for handling prompt-based model inputs\n", + "- `extract_tool_calls_from_agent_output` and `_convert_to_tool_call_list` from `validmind.scorers.llm.deepeval` for extracting and converting tool calls from agent outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9aeb8969", + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.models import Prompt\n", + "from validmind.scorers.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list" + ] + }, + { + "cell_type": "markdown", + "id": "f67f2955", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Create agent wrapper function\n", + "\n", + "We'll then create a wrapper function that:\n", + "\n", + "- Accepts input in ValidMind's expected format (with `input` and `session_id` fields)\n", + "- Invokes the banking agent with the proper state initialization\n", + "- Captures tool outputs and tool calls for evaluation\n", + "- Returns a standardized response format that includes the prediction, full output, tool messages, and tool call information\n", + "- Handles errors gracefully with fallback responses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e4d5a82", + "metadata": {}, + "outputs": [], + "source": [ + "def banking_agent_fn(input):\n", + " \"\"\"\n", + " Invoke the banking agent with the given input.\n", + " \"\"\"\n", + " try:\n", + " # Initial state for banking agent\n", + " initial_state = {\n", + " \"user_input\": input[\"input\"],\n", + " \"messages\": [HumanMessage(content=input[\"input\"])],\n", + " \"session_id\": input[\"session_id\"],\n", + " \"context\": {}\n", + " }\n", + " session_config = {\"configurable\": {\"thread_id\": input[\"session_id\"]}}\n", + " result = banking_agent.invoke(initial_state, config=session_config)\n", + "\n", + " from utils import capture_tool_output_messages\n", + "\n", + " # Capture all tool outputs and metadata\n", + " captured_data = capture_tool_output_messages(result)\n", + " \n", + " # Access specific tool outputs, this will be used for RAGAS tests\n", + " tool_message = \"\"\n", + " for output in captured_data[\"tool_outputs\"]:\n", + " tool_message += output['content']\n", + " \n", + " tool_calls_found = []\n", + " messages = result['messages']\n", + " for message in messages:\n", + " if hasattr(message, 'tool_calls') and message.tool_calls:\n", + " for tool_call in message.tool_calls:\n", + " # Handle both dictionary and object formats\n", + " if isinstance(tool_call, dict):\n", + " tool_calls_found.append(tool_call['name'])\n", + " else:\n", + " # ToolCall object - use attribute access\n", + " tool_calls_found.append(tool_call.name)\n", + "\n", + "\n", + " return {\n", + " \"prediction\": result['messages'][-1].content[0]['text'],\n", + " \"output\": result,\n", + " \"tool_messages\": [tool_message],\n", + " # \"tool_calls\": tool_calls_found,\n", + " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", + " }\n", + " except Exception as e:\n", + " # Return a fallback response if the agent fails\n", + " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", + " Please try rephrasing your question or 
contact support if the issue persists.\"\"\"\n", + " return {\n", + " \"prediction\": error_message, \n", + " \"output\": {\n", + " \"messages\": [HumanMessage(content=input[\"input\"]), SystemMessage(content=error_message)],\n", + " \"error\": str(e)\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "4bdc90d6", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Initialize the ValidMind model object\n", + "\n", + "We'll also need to register the banking agent as a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data.\n", + "\n", + "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model) that:\n", + "\n", + "- Associates the wrapper function with the model for prediction\n", + "- Stores the system prompt template for documentation\n", + "- Provides a unique `input_id` for tracking and identification\n", + "- Enables the agent to be used with ValidMind's testing and documentation features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60a2ce7a", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the agent as a model\n", + "vm_banking_model = vm.init_model(\n", + " input_id=\"banking_agent_model\",\n", + " predict_fn=banking_agent_fn,\n", + " prompt=Prompt(template=system_context)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "33ed446a", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Store the agent reference\n", + "\n", + "We'll also store a reference to the original banking agent object in the ValidMind model. This allows us to access the full agent functionality directly if needed, while still maintaining the wrapper function interface for ValidMind's testing framework." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c653471", + "metadata": {}, + "outputs": [], + "source": [ + "# Add the banking agent to the vm model\n", + "vm_banking_model.model = banking_agent" + ] + }, + { + "cell_type": "markdown", + "id": "bf44ea16", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Verify integration\n", + "\n", + "Let's confirm that the banking agent has been successfully integrated with ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e101b0f", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Banking Agent Successfully Integrated with ValidMind!\")\n", + "print(f\"Model ID: {vm_banking_model.input_id}\")" + ] + }, + { + "cell_type": "markdown", + "id": "0c80518d", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Validate the system prompt\n", + "\n", + "Let's get an initial sense of how well our defined system prompt meets a few best practices for prompt engineering by running a few tests — we'll run evaluation tests later on our agent's performance.\n", + "\n", + "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module. 
Passing in our agentic model as an input, the tests below rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **[Clarity](https://docs.validmind.ai/tests/prompt_validation/Clarity.html)** — How clearly the prompt states the task.\n", + "- **[Conciseness](https://docs.validmind.ai/tests/prompt_validation/Conciseness.html)** — How succinctly the prompt states the task.\n", + "- **[Delimitation](https://docs.validmind.ai/tests/prompt_validation/Delimitation.html)** — When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **[NegativeInstruction](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — Whether the prompt contains negative instructions.\n", + "- **[Specificity](https://docs.validmind.ai/tests/prompt_validation/NegativeInstruction.html)** — How specific the prompt defines the task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f52dceb1", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70d52333", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aa89976", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8630197e", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bba99915", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "af4d6d77", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initialize the ValidMind datasets\n", + "\n", + "After validation our system prompt, let's import our sample dataset ([banking_test_dataset.py](banking_test_dataset.py)), which we'll use in the next section to evaluate our agent's performance across different banking scenarios:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c70ca2c", + "metadata": {}, + "outputs": [], + "source": [ + "from banking_test_dataset import banking_test_dataset" + ] + }, + { + "cell_type": "markdown", + "id": "0268ce6e", + "metadata": {}, + "source": [ + "The next step is to connect your data with a ValidMind `Dataset` object. **This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n", + "\n", + "Initialize a ValidMind dataset object using the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset) from the ValidMind (`vm`) module. 
For this example, we'll pass in the following arguments:\n", + "\n", + "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n", + "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n", + "- **`text_column`** — The name of the column containing the text input data.\n", + "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7e9d158", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset = vm.init_dataset(\n", + " input_id=\"banking_test_dataset\",\n", + " dataset=banking_test_dataset,\n", + " text_column=\"input\",\n", + " target_column=\"possible_outputs\",\n", + ")\n", + "\n", + "print(\"Banking Test Dataset Initialized in ValidMind!\")\n", + "print(f\"Dataset ID: {vm_test_dataset.input_id}\")\n", + "print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n", + "vm_test_dataset._df" + ] + }, + { + "cell_type": "markdown", + "id": "b9143fb6", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign predictions\n", + "\n", + "Now that both the model object and the datasets have been registered, we'll assign predictions to capture the banking agent's responses for evaluation:\n", + "\n", + "- The [`assign_predictions()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#assign_predictions) from the `Dataset` object can link existing predictions to any number of models.\n", + "- This method links the model's class prediction values and probabilities to our `vm_train_ds` and `vm_test_ds` datasets.\n", + "\n", + "If no prediction values are passed, the method will compute predictions automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d462663", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_predictions(vm_banking_model)\n", + "\n", + "print(\"Banking Agent Predictions Generated Successfully!\")\n", + "print(f\"Predictions assigned to {len(vm_test_dataset._df)} test cases\")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8e50467e", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Running accuracy tests\n", + "\n", + "Using [`@vm.test`](https://docs.validmind.ai/validmind/validmind.html#test), let's implement some reusable custom *inline tests* to assess the accuracy of our banking agent:\n", + "\n", + "- An inline test refers to a test written and executed within the same environment as the code being tested — in this case, right in this Jupyter Notebook — without requiring a separate test file or framework.\n", + "- You'll note that the custom test functions are just regular Python functions that can include and require any Python library as you see fit." + ] + }, + { + "cell_type": "markdown", + "id": "6d8a9b90", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Response accuracy test\n", + "\n", + "We'll create a custom test that evaluates the banking agent's ability to provide accurate responses by:\n", + "\n", + "- Testing against a dataset of predefined banking questions and expected answers.\n", + "- Checking if responses contain expected keywords and banking terminology.\n", + "- Providing detailed test results including pass/fail status.\n", + "- Helping identify any gaps in the agent's banking knowledge or response quality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90232066", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "@vm.test(\"my_custom_tests.banking_accuracy_test\")\n", + "def banking_accuracy_test(model, dataset, list_of_columns):\n", + " \"\"\"\n", + " The Banking Accuracy Test evaluates whether the agent’s responses include \n", + " critical domain-specific keywords and phrases that indicate accurate, compliant,\n", + " and contextually appropriate banking information. This test ensures that the agent\n", + " provides responses containing the expected banking terminology, risk classifications,\n", + " account details, or other domain-relevant information required for regulatory compliance,\n", + " customer safety, and operational accuracy.\n", + " \"\"\"\n", + " df = dataset._df\n", + " \n", + " # Pre-compute responses for all tests\n", + " y_true = dataset.y.tolist()\n", + " y_pred = dataset.y_pred(model).tolist()\n", + "\n", + " # Vectorized test results\n", + " test_results = []\n", + " for response, keywords in zip(y_pred, y_true):\n", + " # Convert keywords to list if not already a list\n", + " if not isinstance(keywords, list):\n", + " keywords = [keywords]\n", + " test_results.append(any(str(keyword).lower() in str(response).lower() for keyword in keywords))\n", + " \n", + " results = pd.DataFrame()\n", + " column_names = [col + \"_details\" for col in list_of_columns]\n", + " results[column_names] = df[list_of_columns]\n", + " results[\"actual\"] = y_pred\n", + " results[\"expected\"] = y_true\n", + " results[\"passed\"] = test_results\n", + " results[\"error\"] = None if test_results else f'Response did not contain any expected keywords: {y_true}'\n", + " \n", + " return results" + ] + }, + { + "cell_type": "markdown", + "id": "7eed5265", + "metadata": {}, + "source": [ + "Now that we've defined our custom response accuracy test, we can run the test using the same `run_test()` function we used earlier to validate the system prompt using our sample dataset and agentic model as input, and log the test results to the ValidMind Platform with the [`log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#log):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68884d5", + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.tests.run_test(\n", + " \"my_custom_tests.banking_accuracy_test\",\n", + " inputs={\n", + " \"dataset\": vm_test_dataset,\n", + " \"model\": vm_banking_model\n", + " },\n", + " params={\n", + " \"list_of_columns\": [\"input\"]\n", + " }\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "id": "4d758ddf", + "metadata": {}, + "source": [ + "Let's review the first five rows of the test dataset to inspect the results to see how well the banking agent performed. Each column in the output serves a specific purpose in evaluating agent performance:\n", + "\n", + "| Column header | Description | Importance |\n", + "|--------------|-------------|------------|\n", + "| **`input`** | Original user query or request | Essential for understanding the context of each test case and tracing which inputs led to specific agent behaviors. |\n", + "| **`expected_tools`** | Banking tools that should be invoked for this request | Enables validation of correct tool selection, which is critical for agentic AI systems where choosing the right tool is a key success metric. 
|\n", + "| **`expected_output`** | Expected output or keywords that should appear in the response | Defines the success criteria for each test case, enabling objective evaluation of whether the agent produced the correct result. |\n", + "| **`session_id`** | Unique identifier for each test session | Allows tracking and correlation of related test runs, debugging specific sessions, and maintaining audit trails. |\n", + "| **`category`** | Classification of the request type | Helps organize test results by domain and identify performance patterns across different banking use cases. |\n", + "| **`banking_agent_model_output`** | Complete agent response including all messages and reasoning | Allows you to examine the full output to assess response quality, completeness, and correctness beyond just keyword matching. |\n", + "| **`banking_agent_model_tool_messages`** | Messages exchanged with the banking tools | Critical for understanding how the agent interacted with tools, what parameters were passed, and what tool outputs were received. |\n", + "| **`banking_agent_model_tool_called`** | Specific tool that was invoked | Enables validation that the agent selected the correct tool for each request, which is fundamental to agentic AI validation. |\n", + "| **`possible_outputs`** | Alternative valid outputs or keywords that could appear in the response | Provides flexibility in evaluation by accounting for multiple acceptable response formats or variations. |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78f7edb1", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.df.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "6f233bef", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Tool selection accuracy test\n", + "\n", + "We'll also create a custom test that evaluates the banking agent's ability to select the correct tools for different requests by:\n", + "\n", + "- Testing against a dataset of predefined banking queries with expected tool selections.\n", + "- Comparing the tools actually invoked by the agent against the expected tools for each request.\n", + "- Providing quantitative accuracy scores that measure the proportion of expected tools correctly selected.\n", + "- Helping identify gaps in the agent's understanding of user needs and tool selection logic." + ] + }, + { + "cell_type": "markdown", + "id": "d0b46111", + "metadata": {}, + "source": [ + "First, we'll define a helper function that extracts tool calls from the agent's messages and compares them against the expected tools. 
This function handles different message formats (dictionary or object) and calculates accuracy scores:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68798be", + "metadata": {}, + "outputs": [], + "source": [ + "def validate_tool_calls_simple(messages, expected_tools):\n", + " \"\"\"Simple validation of tool calls without RAGAS dependency issues.\"\"\"\n", + " \n", + " tool_calls_found = []\n", + " \n", + " for message in messages:\n", + " if hasattr(message, 'tool_calls') and message.tool_calls:\n", + " for tool_call in message.tool_calls:\n", + " # Handle both dictionary and object formats\n", + " if isinstance(tool_call, dict):\n", + " tool_calls_found.append(tool_call['name'])\n", + " else:\n", + " # ToolCall object - use attribute access\n", + " tool_calls_found.append(tool_call.name)\n", + " \n", + " # Check if expected tools were called\n", + " accuracy = 0.0\n", + " matches = 0\n", + " if expected_tools:\n", + " matches = sum(1 for tool in expected_tools if tool in tool_calls_found)\n", + " accuracy = matches / len(expected_tools)\n", + " \n", + " return {\n", + " 'expected_tools': expected_tools,\n", + " 'found_tools': tool_calls_found,\n", + " 'matches': matches,\n", + " 'total_expected': len(expected_tools) if expected_tools else 0,\n", + " 'accuracy': accuracy,\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "1b45472c", + "metadata": {}, + "source": [ + "Now we'll define the main test function that uses the helper function to evaluate tool selection accuracy across all test cases in the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "604d7313", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.BankingToolCallAccuracy\")\n", + "def BankingToolCallAccuracy(dataset, agent_output_column, expected_tools_column):\n", + " \"\"\"\n", + " Evaluates the tool selection accuracy of a LangGraph-powered banking agent.\n", + "\n", + " This test measures whether the agent correctly identifies and invokes the required banking tools\n", + " for each user query scenario.\n", + " For each case, the outputs generated by the agent (including its tool calls) are compared against an\n", + " expected set of tools. The test considers both coverage and exactness: it computes the proportion of\n", + " expected tools correctly called by the agent for each instance.\n", + "\n", + " Parameters:\n", + " dataset (VMDataset): The dataset containing user queries, agent outputs, and ground-truth tool expectations.\n", + " agent_output_column (str): Dataset column name containing agent outputs (should include tool call details in 'messages').\n", + " expected_tools_column (str): Dataset column specifying the true expected tools (as lists).\n", + "\n", + " Returns:\n", + " List[dict]: Per-row dictionaries with details: expected tools, found tools, match count, total expected, and accuracy score.\n", + "\n", + " Purpose:\n", + " Provides diagnostic evidence of the banking agent's core reasoning ability—specifically, its capacity to\n", + " interpret user needs and select the correct banking actions. Useful for diagnosing gaps in tool coverage,\n", + " misclassifications, or breakdowns in agent logic.\n", + "\n", + " Interpretation:\n", + " - An accuracy of 1.0 signals perfect tool selection for that example.\n", + " - Lower scores may indicate partial or complete failures to invoke required tools.\n", + " - Review 'found_tools' vs. 
'expected_tools' to understand the source of discrepancies.\n", + "\n", + " Strengths:\n", + " - Directly tests a core capability of compositional tool-use agents.\n", + " - Framework-agnostic; robust to tool call output format (object or dict).\n", + " - Supports batch validation and result logging for systematic documentation.\n", + "\n", + " Limitations:\n", + " - Does not penalize extra, unnecessary tool calls.\n", + " - Does not assess result quality—only correct invocation.\n", + "\n", + " \"\"\"\n", + " df = dataset._df\n", + " \n", + " results = []\n", + " for i, row in df.iterrows():\n", + " result = validate_tool_calls_simple(row[agent_output_column]['messages'], row[expected_tools_column])\n", + " results.append(result)\n", + " \n", + " return results" + ] + }, + { + "cell_type": "markdown", + "id": "d594c973", + "metadata": {}, + "source": [ + "Finally, we can call our function with `run_test()` and log the test results to the ValidMind Platform:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd14115e", + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.tests.run_test(\n", + " \"my_custom_tests.BankingToolCallAccuracy\",\n", + " inputs={\n", + " \"dataset\": vm_test_dataset,\n", + " },\n", + " params={\n", + " \"agent_output_column\": \"banking_agent_model_output\",\n", + " \"expected_tools_column\": \"expected_tools\"\n", + " }\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "id": "f78f4107", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Assigning AI evaluation metric scores\n", + "\n", + "*AI agent evaluation metrics* are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the *full execution trace* — including reasoning steps, tool calls, intermediate decisions, and outcomes, rather than just single input–output pairs. These metrics are essential because agent failures often occur in ways traditional LLM metrics miss — for example, choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently.\n", + "\n", + "In this section, we'll evaluate our banking agent's outputs and add scoring to our sample dataset against metrics defined in [DeepEval’s AI agent evaluation framework](https://deepeval.com/guides/guides-ai-agent-evaluation-metrics) which breaks down AI agent evaluation into three layers with corresponding subcategories: **reasoning**, **action**, and **execution**.\n", + "\n", + "Together, these three metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3a9c853a", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Identify relevant DeepEval scorers\n", + "\n", + "*Scorers* are evaluation metrics that analyze model outputs and store their results in the dataset:\n", + "\n", + "- Each scorer adds a new column to the dataset with format: `{scorer_name}_{metric_name}`\n", + "- The column contains the numeric score (typically `0`-`1`) for each example\n", + "- Multiple scorers can be run on the same dataset, each adding their own column\n", + "- Scores are persisted in the dataset for later analysis and visualization\n", + "- Common scorer patterns include:\n", + " - Model performance metrics (accuracy, F1, etc.)\n", + " - Output quality metrics (relevance, faithfulness)\n", + " - Task-specific metrics (completion, correctness)\n", + "\n", + "Use `list_scorers()` from [`validmind.scorers`](https://docs.validmind.ai/validmind/validmind/tests.html#scorer) to discover all available scoring methods and their IDs that can be used with `assign_scores()`. We'll filter these results to return only DeepEval scorers for our desired three metrics in a formatted table with descriptions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730c70ec", + "metadata": {}, + "outputs": [], + "source": [ + "# Load all DeepEval scorers\n", + "llm_scorers_dict = vm.tests.load._load_tests([s for s in vm.scorer.list_scorers() if \"deepeval\" in s.lower()])\n", + "\n", + "# Categorize scorers by metric layer\n", + "reasoning_scorers = {}\n", + "action_scorers = {}\n", + "execution_scorers = {}\n", + "\n", + "for scorer_id, scorer_func in llm_scorers_dict.items():\n", + " tags = getattr(scorer_func, \"__tags__\", [])\n", + " scorer_name = scorer_id.split(\".\")[-1]\n", + "\n", + " if \"reasoning_layer\" in tags:\n", + " reasoning_scorers[scorer_id] = scorer_func\n", + " elif \"action_layer\" in tags:\n", + " action_scorers[scorer_id] = scorer_func\n", + " elif \"TaskCompletion\" in scorer_name:\n", + " execution_scorers[scorer_id] = scorer_func\n", + "\n", + "# Display scorers by category\n", + "print(\"=\" * 80)\n", + "print(\"REASONING LAYER\")\n", + "print(\"=\" * 80)\n", + "if reasoning_scorers:\n", + " reasoning_df = vm.tests.load._pretty_list_tests(reasoning_scorers, truncate=True)\n", + " display(reasoning_df)\n", + "else:\n", + " print(\"No reasoning layer scorers found.\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"ACTION LAYER\")\n", + "print(\"=\" * 80)\n", + "if action_scorers:\n", + " action_df = vm.tests.load._pretty_list_tests(action_scorers, truncate=True)\n", + " display(action_df)\n", + "else:\n", + " print(\"No action layer scorers found.\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"EXECUTION LAYER\")\n", + "print(\"=\" * 80)\n", + "if execution_scorers:\n", + " execution_df = vm.tests.load._pretty_list_tests(execution_scorers, truncate=True)\n", + " display(execution_df)\n", + "else:\n", + " print(\"No execution layer scorers found.\")" + ] + }, + { + "cell_type": "markdown", + "id": "4dd73d0d", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign reasoning scores\n", + "\n", + "*Reasoning* evaluates planning and strategy generation:\n", + "\n", + "- **Plan quality** – How logical, complete, and efficient the agent’s plan is.\n", + "- **Plan adherence** – Whether the agent follows its own plan during execution." 
+ ] + }, + { + "cell_type": "markdown", + "id": "06ccae28", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Plan quality score\n", + "\n", + "Let's measure how well our banking agent generates a plan before acting. A high score means the plan is logical, complete, and efficient." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52f362ba", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.PlanQuality\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8dcdc88f", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Plan adherence score\n", + "\n", + "Let's check whether our banking agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4124a7c2", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.PlanAdherence\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " expected_output_column = \"expected_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6da1ac95", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign action scores\n", + "\n", + "*Action* assesses tool usage and argument generation:\n", + "\n", + "- **Tool correctness** – Whether the agent selects and calls the right tools.\n", + "- **Argument correctness** – Whether the agent generates correct tool arguments." + ] + }, + { + "cell_type": "markdown", + "id": "d4db8270", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Tool correctness score\n", + "\n", + "Let's evaluate if our banking agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d2e8a25", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.ToolCorrectness\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " expected_tools_column = \"expected_tools\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "9aa50b05", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Argument correctness score\n", + "\n", + "Let's assesses whether our banking agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04f90489", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.ArgumentCorrectness\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c59e5595", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign execution score\n", + "\n", + "*Execution* measures end-to-end performance:\n", + "\n", + "- **Task completion** – Whether the agent successfully completes the intended task.\n" + ] + }, + { + "cell_type": "markdown", + "id": "d64600ca", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Task completion score\n", + "\n", + "Let's evaluate whether our banking agent successfully completes the requested tasks. Incomplete task execution can lead to user dissatisfaction and failed banking operations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05024f1f", + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_dataset.assign_scores(\n", + " metrics = \"validmind.scorers.llm.deepeval.TaskCompletion\",\n", + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "21aa9b0d", + "metadata": {}, + "source": [ + "As you recall from the beginning of this section, when we run scorers through `assign_scores()`, the return values are automatically processed and added as new columns with the format `{scorer_name}_{metric_name}`. Note that the task completion scorer has added a new column `TaskCompletion_score` to our dataset.\n", + "\n", + "We'll use this column to visualize the distribution of task completion scores across our test cases through the [BoxPlot test](https://docs.validmind.ai/validmind/validmind/tests/plots/BoxPlot.html#boxplot):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f6d08ca", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.plots.BoxPlot\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " params={\n", + " \"columns\": \"TaskCompletion_score\",\n", + " \"title\": \"Distribution of Task Completion Scores\",\n", + " \"ylabel\": \"Score\",\n", + " \"figsize\": (8, 6)\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "012bbcb8", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Running RAGAS tests\n", + "\n", + "Next, let's run some out-of-the-box *Retrieval-Augmented Generation Assessment* (RAGAS) tests available in the ValidMind Library. RAGAS provides specialized metrics for evaluating retrieval-augmented generation systems and conversational AI agents. These metrics analyze different aspects of agent performance by assessing how well systems integrate retrieved information with generated responses.\n", + "\n", + "Our banking agent uses tools to retrieve information and generates responses based on that context, making it similar to a RAG system. 
RAGAS metrics help evaluate the quality of this integration by analyzing the relationship between retrieved tool outputs, user queries, and generated responses.\n", + "\n", + "These tests provide insights into how well our banking agent integrates tool usage with conversational abilities, ensuring it provides accurate, relevant, and helpful responses to banking users while maintaining fidelity to retrieved information." + ] + }, + { + "cell_type": "markdown", + "id": "2036afba", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Identify relevant RAGAS tests\n", + "\n", + "Let's explore some of ValidMind's available tests. Using ValidMind’s repository of tests streamlines your development testing, and helps you ensure that your models are being documented and evaluated appropriately.\n", + "\n", + "You can pass `tasks` and `tags` as parameters to the [`vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) to filter the tests based on the tags and task types:\n", + "\n", + "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `text_qa` tasks.\n", + "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `ragas` tag.\n", + "\n", + "We'll then run three of these tests returned as examples below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0701f5a9", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(task=\"text_qa\", tags=[\"ragas\"])" + ] + }, + { + "cell_type": "markdown", + "id": "c1741ffc", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Faithfulness\n", + "\n", + "Let's evaluate whether the banking agent's responses accurately reflect the information retrieved from tools. Unfaithful responses can misreport credit analysis, financial calculations, and compliance results—undermining user trust in the banking agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92044533", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.Faithfulness\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"response_column\": [\"banking_agent_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "42b71ccc", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Response Relevancy\n", + "\n", + "Let's evaluate whether the banking agent's answers address the user's original question or request. Irrelevant or off-topic responses can frustrate users and fail to deliver the banking information they need." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7483bc3", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " params={\n", + " \"user_input_column\": \"input\",\n", + " \"response_column\": \"banking_agent_model_prediction\",\n", + " \"retrieved_contexts_column\": \"banking_agent_model_tool_messages\",\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "4f4d0569", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Context Recall\n", + "\n", + "Let's evaluate how well the banking agent uses the information retrieved from tools when generating its responses. Poor context recall can lead to incomplete or underinformed answers even when the right tools were selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5dc00ce", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.ContextRecall\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " \"reference_column\": [\"banking_agent_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "b987b00e", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Running safety tests\n", + "\n", + "Finally, let's run some out-of-the-box *safety* tests available in the ValidMind Library. Safety tests provide specialized metrics for evaluating whether AI agents operate reliably and securely. These metrics analyze different aspects of agent behavior by assessing adherence to safety guidelines, consistency of outputs, and resistance to harmful or inappropriate requests.\n", + "\n", + "Our banking agent handles sensitive financial information and user requests, making safety and reliability essential. Safety tests help evaluate whether the agent maintains appropriate boundaries, responds consistently and correctly to inputs, and avoids generating harmful, biased, or unprofessional content.\n", + "\n", + "These tests provide insights into how well our banking agent upholds standards of fairness and professionalism, ensuring it operates reliably and securely for banking users." + ] + }, + { + "cell_type": "markdown", + "id": "a754cca3", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### AspectCritic\n", + "\n", + "Let's evaluate our banking agent's responses across multiple quality dimensions — conciseness, coherence, correctness, harmfulness, and maliciousness. Weak performance on these dimensions can degrade user experience, fall short of professional banking standards, or introduce safety risks. 
\n", + "\n", + "We'll use the `AspectCritic` we identified earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "148daa2b", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.ragas.AspectCritic\",\n", + " inputs={\"dataset\": vm_test_dataset},\n", + " param_grid={\n", + " \"user_input_column\": [\"input\"],\n", + " \"response_column\": [\"banking_agent_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"banking_agent_model_tool_messages\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "92e5b1f6", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Bias\n", + "\n", + "Let's evaluate whether our banking agent's prompts contain unintended biases that could affect banking decisions. Biased prompts can lead to unfair or discriminatory outcomes — undermining customer trust and exposing the institution to compliance risk.\n", + "\n", + "We'll first use `list_tests()` again to filter for tests relating to `prompt_validation`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74eba86c", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(filter=\"prompt_validation\")" + ] + }, + { + "cell_type": "markdown", + "id": "bcc66b65", + "metadata": {}, + "source": [ + "And then run the identified `Bias` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "062cf8e7", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.prompt_validation.Bias\",\n", + " inputs={\n", + " \"model\": vm_banking_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "a2832750", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the output produced by the ValidMind Library right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation." + ] + }, + { + "cell_type": "markdown", + "id": "a8cb1a58", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. In the left sidebar that appears for your model, click **Documentation** under Documents.\n", + "\n", + " What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/working-with-model-documentation.html)\n", + "\n", + "3. Click into any section related to the tests we ran in this notebook, for example: **4.3. Prompt Evaluation** to review the results of the tests we logged." + ] + }, + { + "cell_type": "markdown", + "id": "94ef26be", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Customize the banking agent for your use case\n", + "\n", + "You've now built an agentic AI system designed for banking use cases that supports compliance with supervisory guidance such as SR 11-7 and SS1/23, covering credit and fraud risk assessment for both retail and commercial banking. 
Extend this example agent to real-world banking scenarios and production deployment by:\n", + "\n", + "- Adapting the banking tools to your organization's specific requirements\n", + "- Adding more banking scenarios and edge cases to your test set\n", + "- Connecting the agent to your banking systems and databases\n", + "- Implementing additional banking-specific tools and workflows" + ] + }, + { + "cell_type": "markdown", + "id": "a681e49c", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "Learn more about the ValidMind Library tools we used in this notebook:\n", + "\n", + "- [Custom prompts](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.html)\n", + "- [Custom tests](https://docs.validmind.ai/notebooks/how_to/tests/custom_tests/implement_custom_tests.html)\n", + "- [ValidMind scorers](https://docs.validmind.ai/notebooks/how_to/scoring/assign_scores_complete_tutorial.html)\n", + "\n", + "We also offer many more interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/guide/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "id": "707c1b6e", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, periodically check that you’re on the latest version so you have access to new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9733adff", + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "id": "e4b0b646", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "387fa7f1", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-de4baf0f42ba4a37946d52586dff1049", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-1QuffXMV-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/site/notebooks/code_samples/agents/utils.py b/site/notebooks/use_cases/agents/utils.py similarity index 100% rename from site/notebooks/code_samples/agents/utils.py rename to site/notebooks/use_cases/agents/utils.py diff --git a/site/notebooks/use_cases/capital_markets/capital_markets_template.yaml b/site/notebooks/use_cases/capital_markets/capital_markets_template.yaml new file mode 100644 index 0000000000..9cb561dc27 --- /dev/null +++ b/site/notebooks/use_cases/capital_markets/capital_markets_template.yaml @@ -0,0 +1,227 @@ +# Copyright © 2023-2026 ValidMind Inc. All rights reserved. +# Refer to the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +- id: model_metadata + title: Model Metadata and Stakeholders + index_only: true + sections: + - id: metadata + title: Metadata + parent_section: model_metadata + guidelines: + - Provide detailed metadata to uniquely identify the model, ensuring + that each entry is traceable and consistent with internal records. + - Specify the platform to enable validation and governance teams to + understand technical dependencies. + - Metadata should be aligned with enterprise systems (e.g., Model Risk + Management System). + - Be specific about versioning and any platform customizations. + - id: stakeholders + title: Stakeholders + parent_section: model_metadata + guidelines: + - List all individuals responsible for the development, use, and + oversight of the model. + - Ensure clarity in responsibilities to avoid overlap and gaps in + accountability. + - Include specific roles, such as "Data Scientist," "Validation Lead," + or "Business Analyst," and describe their responsibilities relative to + the model. +- id: business_context + title: Business Context and Purpose + index_only: true + sections: + - id: business_problem + title: Business Problem and Objectives + parent_section: business_context + guidelines: + - Clearly describe the problem the model addresses and its alignment + with business goals. + - Include specific use cases, outputs, and highlight regulatory + expectations to demonstrate compliance. + - Specify compliance requirements, such as IFRS, Basel III or SR11-7, as + applicable. + - id: products_and_risks + title: Products and Risks + parent_section: business_context + guidelines: + - Detail how the model impacts products or processes and specify + associated risks (e.g., market, credit, or operational risks). + - Provide a description of business impacts and compliance with + applicable regulations. +- id: model_design + title: Model Methodology and Design + index_only: true + sections: + - id: theoretical_foundations + title: Theoretical Foundations + parent_section: model_design + guidelines: + - Explain the methodology, assumptions, and logic underlying the model. + - Clearly document how key risks (e.g., credit default or liquidity) are + addressed within the model. + - Highlight any simplifications or approximations and their implications. 
+ - id: architecture + title: Model Architecture + parent_section: model_design + guidelines: + - Use diagrams to illustrate the model's structure and data flow. + - Include visual flowcharts detailing how inputs are transformed into + outputs. + - Highlight dependencies affecting validation or performance. + - id: selection + title: Model Selection and Justification + parent_section: model_design + guidelines: + - Document the decision-making process, including comparisons to + alternative approaches. + - Provide rationale for model selection using performance metrics like + R-squared or RMSE. + - Highlight reasons for rejecting alternatives. +- id: model_data + title: Model Data + index_only: true + sections: + - id: input_data + title: Input Data + parent_section: model_data + guidelines: + - Detail the provenance and quality of input data, including + preprocessing steps like imputation or outlier detection. + - Highlight known issues, such as stale data or incomplete time series. + - Suggest standard tests, e.g., null value checks, distribution + matching, or correlation analysis. + - id: dataset_characteristics + title: Development Dataset + parent_section: model_data + guidelines: + - Summarize dataset characteristics like size, representativeness, and + scope. + - Include validation metrics such as sampling error or coverage ratios. + - id: outputs + title: Outputs + parent_section: model_data + guidelines: + - Define outputs, their usage, and storage mechanisms. + - Highlight data formats (e.g., APIs, flat files) and ensure outputs are + validated for consistency and accuracy. +- id: model_testing + title: Model Testing + index_only: true + sections: + - id: diagnostic_testing + title: Diagnostic Testing + parent_section: model_testing + guidelines: + - Provide details of diagnostic tests performed to ensure model + performance and identify anomalies. + - Include standard diagnostic tests such as - Residual analysis to check + model predictions. - Comparison of predicted versus actual outcomes + for validity. + - Summarize findings and highlight any performance gaps. + - id: sensitivity_stress_testing + title: Sensitivity and Stress Testing + parent_section: model_testing + guidelines: + - Sensitivity Testing. Describe how changes in input variables affect + model outputs. + - Stress Testing. Document model performance under extreme conditions or + assumptions. + - Use tests to measure how minor changes in key parameters affect + results. + - Simulate scenarios like extreme economic downturns to evaluate + robustness. + - id: performance_testing + title: Performance Testing + parent_section: model_testing + guidelines: + - Provide details of performance metrics such as RMSE, AUC, or + precision/recall. + - Benchmark comparisons. Compare performance with industry standards or + alternative models. + - Include visual aids like ROC curves or confusion matrices to + illustrate performance. + - id: back_testing + title: Back-Testing + parent_section: model_testing + guidelines: + - Highlight alignment of predictions with observed outcomes through + historical analysis. + - Test model predictions against historical data outcomes. + - Document discrepancies and propose remediation steps. +- id: implementation + title: Model Implementation + index_only: true + sections: + - id: production_environment + title: Production Environment + parent_section: implementation + guidelines: + - Describe the implementation environment, such as cloud platforms like + AWS or GCP. 
+ - Highlight integration points, such as database connectors or REST + APIs, ensuring consistency with design specifications. + - id: implementation_testing + title: Implementation Testing + parent_section: implementation + guidelines: + - Document verification steps, including parallel runs against legacy + systems and end-to-end pipeline testing. +- id: limitations_adjustments + title: Assumptions, Limitations, and Adjustments + index_only: true + sections: + - id: assumptions + title: Assumptions + parent_section: limitations_adjustments + guidelines: + - List assumptions critical to model functionality and provide + justifications. + - Include potential impact analyses if assumptions fail. + - id: limitations + title: Limitations + parent_section: limitations_adjustments + guidelines: + - Highlight known limitations and mitigation strategies. + - Discuss implications for model performance or reliability. + - id: adjustments + title: Adjustments + parent_section: limitations_adjustments + guidelines: + - Document overrides and their justification, including governance + processes. +- id: monitoring_controls_documentation + title: Model Monitoring and Controls + index_only: true + sections: + - id: ongoing + title: Ongoing Model Monitoring + parent_section: monitoring_controls_documentation + guidelines: + - Outline a monitoring plan, including performance metrics, monitoring + frequency, and escalation thresholds. + - Include drift analysis of input data distributions and stability + metrics for output consistency. + - id: governance + title: Governance + parent_section: monitoring_controls_documentation + guidelines: + - Define access controls, version management, and governance frameworks. + - Highlight periodic audits and role-based access controls. +- id: documentation_references + title: Documentation References + index_only: true + sections: + - id: supporting_documents + title: Supporting Documents + parent_section: documentation_references + guidelines: + - Provide references to related documentation, such as validation + reports. + - Maintain a change log for systematic traceability. + - id: appendices + title: Appendices + parent_section: documentation_references + guidelines: + - Use appendices for supplementary data, testing results, and glossaries. diff --git a/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models.ipynb b/site/notebooks/use_cases/capital_markets/quickstart_option_pricing_models.ipynb similarity index 99% rename from site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models.ipynb rename to site/notebooks/use_cases/capital_markets/quickstart_option_pricing_models.ipynb index 6953959242..a218950c56 100644 --- a/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models.ipynb +++ b/site/notebooks/use_cases/capital_markets/quickstart_option_pricing_models.ipynb @@ -117,7 +117,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. 
See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", diff --git a/site/notebooks/use_cases/capital_markets/quickstart_option_pricing_models_quantlib.ipynb b/site/notebooks/use_cases/capital_markets/quickstart_option_pricing_models_quantlib.ipynb new file mode 100644 index 0000000000..de541b448a --- /dev/null +++ b/site/notebooks/use_cases/capital_markets/quickstart_option_pricing_models_quantlib.ipynb @@ -0,0 +1,1345 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1e2a4689", + "metadata": {}, + "source": [ + "# Quickstart for Heston option pricing model using QuantLib\n", + "\n", + "Welcome! Let's get you started with the basic process of documenting models with ValidMind.\n", + "\n", + "The Heston option pricing model is a popular stochastic volatility model used to price options. Developed by Steven Heston in 1993, the model assumes that the asset's volatility follows a mean-reverting square-root process, allowing it to capture the empirical observation of volatility \"clustering\" in financial markets. This model is particularly useful for assets where volatility is not constant, making it a favored approach in quantitative finance for pricing complex derivatives.\n", + "\n", + "Here’s an overview of the Heston model as implemented in QuantLib, a powerful library for quantitative finance:\n", + "\n", + "\n", + "\n", + "### Model Assumptions and Characteristics\n", + "1. **Stochastic Volatility**: The volatility is modeled as a stochastic process, following a mean-reverting square-root process (Cox-Ingersoll-Ross process).\n", + "2. **Correlated Asset and Volatility Processes**: The asset price and volatility are assumed to be correlated, allowing the model to capture the \"smile\" effect observed in implied volatilities.\n", + "3. 
**Risk-Neutral Dynamics**: The Heston model is typically calibrated under a risk-neutral measure, which allows for direct application to pricing.\n", + "\n", + "\n", + "\n", + "### Heston Model Parameters\n", + "The model is governed by a set of key parameters:\n", + "- **S0**: Initial stock price\n", + "- **v0**: Initial variance of the asset price\n", + "- **kappa**: Speed of mean reversion of the variance\n", + "- **theta**: Long-term mean level of variance\n", + "- **sigma**: Volatility of volatility (vol of vol)\n", + "- **rho**: Correlation between the asset price and variance processes\n", + "\n", + "The dynamics of the asset price \\( S \\) and the variance \\( v \\) under the Heston model are given by:\n", + "\n", + "$$\n", + "dS_t = r S_t \\, dt + \\sqrt{v_t} S_t \\, dW^S_t\n", + "$$\n", + "\n", + "$$\n", + "dv_t = \\kappa (\\theta - v_t) \\, dt + \\sigma \\sqrt{v_t} \\, dW^v_t\n", + "$$\n", + "\n", + "where \\( $dW^S$ \\) and \\( $dW^v$ \\) are Wiener processes with correlation \\( $\\rho$ \\).\n", + "\n", + "\n", + "\n", + "### Advantages and Limitations\n", + "- **Advantages**:\n", + " - Ability to capture volatility smiles and skews.\n", + " - More realistic pricing for options on assets with stochastic volatility.\n", + "- **Limitations**:\n", + " - Calibration can be complex due to the number of parameters.\n", + " - Computationally intensive compared to simpler models like Black-Scholes.\n", + "\n", + "This setup provides a robust framework for pricing and analyzing options with stochastic volatility dynamics. QuantLib’s implementation makes it easy to experiment with different parameter configurations and observe their effects on pricing.\n", + "\n", + "You will learn how to initialize the ValidMind Library, develop a option pricing model, and then write custom tests that can be used for sensitivity and stress testing to quickly generate documentation about model." 
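To make the setup above concrete, here is a minimal sketch of pricing a one-year at-the-money European call under the Heston model using the standard QuantLib-Python classes (`HestonProcess`, `HestonModel`, `AnalyticHestonEngine`). The evaluation date, rates, and Heston parameter values below are illustrative placeholders, not the values used later in this notebook:

```python
import QuantLib as ql

# Evaluation date and flat market curves (illustrative values only)
today = ql.Date(15, ql.June, 2024)
ql.Settings.instance().evaluationDate = today
day_count = ql.Actual365Fixed()
risk_free = ql.YieldTermStructureHandle(ql.FlatForward(today, 0.03, day_count))
dividends = ql.YieldTermStructureHandle(ql.FlatForward(today, 0.0, day_count))
spot = ql.QuoteHandle(ql.SimpleQuote(100.0))

# Heston parameters: v0, kappa, theta, sigma (vol of vol), rho
process = ql.HestonProcess(risk_free, dividends, spot, 0.04, 2.0, 0.04, 0.5, -0.7)
model = ql.HestonModel(process)
engine = ql.AnalyticHestonEngine(model)

# One-year at-the-money European call priced with the semi-analytic engine
payoff = ql.PlainVanillaPayoff(ql.Option.Call, 100.0)
exercise = ql.EuropeanExercise(ql.Date(15, ql.June, 2025))
option = ql.VanillaOption(payoff, exercise)
option.setPricingEngine(engine)
print(f"Heston call NPV: {option.NPV():.4f}")
```

Wrapping a pricing routine like this in a function also makes it straightforward to register with `vm.init_model()` and to vary the Heston parameters when running the sensitivity and stress tests covered later in this notebook.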
+ ] + }, + { + "cell_type": "markdown", + "id": "69ec219a", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + " - [Model Assumptions and Characteristics](#toc1_1__) \n", + " - [Heston Model Parameters](#toc1_2__) \n", + " - [Advantages and Limitations](#toc1_3__) \n", + "- [About ValidMind](#toc2__) \n", + " - [Before you begin](#toc2_1__) \n", + " - [New to ValidMind?](#toc2_2__) \n", + " - [Key concepts](#toc2_3__) \n", + "- [Setting up](#toc3__) \n", + " - [Install the ValidMind Library](#toc3_1__) \n", + " - [Initialize the ValidMind Library](#toc3_2__) \n", + " - [Register sample model](#toc3_2_1__) \n", + " - [Apply documentation template](#toc3_2_2__) \n", + " - [Get your code snippet](#toc3_2_3__) \n", + " - [Initialize the Python environment](#toc3_3__) \n", + " - [Preview the documentation template](#toc3_4__) \n", + "- [Data Preparation](#toc4__) \n", + " - [Helper functions](#toc4_1_1__) \n", + " - [Market Data Quality and Availability](#toc4_2__) \n", + " - [Initialize the ValidMind datasets](#toc4_3__) \n", + " - [Data Quality](#toc4_4__) \n", + " - [Isolation Forest Outliers Test](#toc4_4_1__) \n", + " - [Model parameters](#toc4_4_2__) \n", + "- [Model development - Heston Option price](#toc5__) \n", + " - [Model Calibration](#toc5_1__) \n", + " - [Model Evaluation](#toc5_2__) \n", + " - [Benchmark Testing](#toc5_2_1__) \n", + " - [Sensitivity Testing](#toc5_2_2__) \n", + " - [Stress Testing](#toc5_2_3__) \n", + "- [Next steps](#toc6__) \n", + " - [Work with your model documentation](#toc6_1__) \n", + " - [Discover more learning resources](#toc6_2__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "b9fb5d17", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f2dccf35", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "5a5ce085", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "409352bf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "id": "65e870b2", + "metadata": {}, + "source": [ + "To install the QuantLib library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a34debf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q QuantLib" + ] + }, + { + "cell_type": "markdown", + "id": "fb30ae07", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "id": "c6f87017", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "id": "cbb2e2c9", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Capital Markets`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "id": "41c4edca", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "2012eb82", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cd3f67e", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "6d944cc9", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the Python environment\n", + "\n", + "Next, let's import the necessary libraries and set up your Python environment for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8cf2746", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from scipy.optimize import minimize\n", + "import yfinance as yf\n", + "import QuantLib as ql\n", + "from validmind.tests import run_test" + ] + }, + { + "cell_type": "markdown", + "id": "bc431ee0", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e844028", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "0c0ee8b9", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Data Preparation" + ] + }, + { + "cell_type": "markdown", + "id": "5a4d2c36", + "metadata": {}, + "source": [ + "### Market Data Sources\n", + "\n", + "\n", + "\n", + "#### Helper functions\n", + "Let's define helper function retrieve to option data from Yahoo Finance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b96a500f", + "metadata": {}, + "outputs": [], + "source": [ + "def get_market_data(ticker, expiration_date_str):\n", + " \"\"\"\n", + " Fetch option market data from Yahoo Finance for the given ticker and expiration date.\n", + " Returns a list of tuples: (strike, maturity, option_price).\n", + " \"\"\"\n", + " # Create a Ticker object for the specified stock\n", + " stock = yf.Ticker(ticker)\n", + "\n", + " # Get all available expiration dates for options\n", + " option_dates = stock.options\n", + "\n", + " # Check if the requested expiration date is available\n", + " if expiration_date_str not in option_dates:\n", + " raise ValueError(f\"Expiration date {expiration_date_str} not available for {ticker}. Available dates: {option_dates}\")\n", + "\n", + " # Get the option chain for the specified expiration date\n", + " option_chain = stock.option_chain(expiration_date_str)\n", + "\n", + " # Get call options (or you can use puts as well based on your requirement)\n", + " calls = option_chain.calls\n", + "\n", + " # Convert expiration_date_str to QuantLib Date\n", + " expiry_date_parts = list(map(int, expiration_date_str.split('-'))) # Split YYYY-MM-DD\n", + " maturity_date = ql.Date(expiry_date_parts[2], expiry_date_parts[1], expiry_date_parts[0]) # Convert to QuantLib Date\n", + "\n", + " # Create a list to store strike prices, maturity dates, and option prices\n", + " market_data = []\n", + " for index, row in calls.iterrows():\n", + " strike = row['strike']\n", + " option_price = row['lastPrice'] # You can also use 'bid', 'ask', 'mid', etc.\n", + " market_data.append((strike, maturity_date, option_price))\n", + " df = pd.DataFrame(market_data, columns = ['strike', 'maturity_date', 'option_price'])\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "c7769b73", + "metadata": {}, + "source": [ + "Let's define helper function retrieve to stock data from Yahoo Finance. This helper function to calculate spot price, dividend yield, volatility and risk free rate using the underline stock data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc44c448", + "metadata": {}, + "outputs": [], + "source": [ + "def get_option_parameters(ticker):\n", + " # Fetch historical data for the stock\n", + " stock_data = yf.Ticker(ticker)\n", + " \n", + " # Get the current spot price\n", + " spot_price = stock_data.history(period=\"1d\")['Close'].iloc[-1]\n", + " \n", + " # Get dividend yield\n", + " dividend_rate = stock_data.dividends.mean() / spot_price if not stock_data.dividends.empty else 0.0\n", + " \n", + " # Estimate volatility (standard deviation of log returns)\n", + " hist_data = stock_data.history(period=\"1y\")['Close']\n", + " log_returns = np.log(hist_data / hist_data.shift(1)).dropna()\n", + " volatility = np.std(log_returns) * np.sqrt(252) # Annualized volatility\n", + " \n", + " # Assume a risk-free rate from some known data (can be fetched from market data, here we use 0.001)\n", + " risk_free_rate = 0.001\n", + " \n", + " # Return the calculated parameters\n", + " return {\n", + " \"spot_price\": spot_price,\n", + " \"volatility\": volatility,\n", + " \"dividend_rate\": dividend_rate,\n", + " \"risk_free_rate\": risk_free_rate\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "c7b739d3", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Market Data Quality and Availability\n", + "Next, let's specify ticker and expiration date to get market data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50225fde", + "metadata": {}, + "outputs": [], + "source": [ + "ticker = \"MSFT\"\n", + "expiration_date = \"2024-12-13\" # Example expiration date in 'YYYY-MM-DD' form\n", + "\n", + "market_data = get_market_data(ticker=ticker, expiration_date_str=expiration_date)" + ] + }, + { + "cell_type": "markdown", + "id": "c539b95e", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind datasets\n", + "\n", + "Before you can run tests, you must first initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "113f9c17", + "metadata": {}, + "outputs": [], + "source": [ + "vm_market_data = vm.init_dataset(\n", + " dataset=market_data,\n", + " input_id=\"market_data\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "185beb24", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Data Quality\n", + "Let's check quality of the data using outliers and missing data tests." + ] + }, + { + "cell_type": "markdown", + "id": "7f14464c", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Isolation Forest Outliers Test\n", + "Let's detects anomalies in the dataset using the Isolation Forest algorithm, visualized through scatter plots." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56c919ec", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"validmind.data_validation.IsolationForestOutliers\",\n", + " inputs={\n", + " \"dataset\": vm_market_data,\n", + " },\n", + " title=\"Outliers detection using Isolation Forest\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e4d0e5ca", + "metadata": {}, + "source": [ + "##### Missing Values Test\n", + "Let's evaluates dataset quality by ensuring the missing value ratio across all features does not exceed a set threshold." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e95c825f", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"validmind.data_validation.MissingValues\",\n", + " inputs={\n", + " \"dataset\": vm_market_data,\n", + " },\n", + " title=\"Missing Values detection\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "829403a3", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Model parameters\n", + "Let's calculate the model parameters using from stock data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25936449", + "metadata": {}, + "outputs": [], + "source": [ + "option_params = get_option_parameters(ticker=ticker)" + ] + }, + { + "cell_type": "markdown", + "id": "0a0948b6", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Model development - Heston Option price" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e15b8221", + "metadata": {}, + "outputs": [], + "source": [ + "class HestonModel:\n", + "\n", + " def __init__(self, ticker, expiration_date_str, calculation_date, spot_price, dividend_rate, risk_free_rate):\n", + " self.ticker = ticker\n", + " self.expiration_date_str = expiration_date_str,\n", + " self.calculation_date = calculation_date\n", + " self.spot_price = spot_price\n", + " self.dividend_rate = dividend_rate\n", + " self.risk_free_rate = risk_free_rate\n", + " \n", + " def predict_option_price(self, strike, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", + " # Set the evaluation date\n", + " ql.Settings.instance().evaluationDate = self.calculation_date\n", + "\n", + " # Construct the European Option\n", + " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", + " exercise = ql.EuropeanExercise(maturity_date)\n", + " european_option = ql.VanillaOption(payoff, exercise)\n", + "\n", + " # Yield term structures for risk-free rate and dividend\n", + " riskFreeTS = ql.YieldTermStructureHandle(ql.FlatForward(calculation_date, self.risk_free_rate, ql.Actual365Fixed()))\n", + " dividendTS = ql.YieldTermStructureHandle(ql.FlatForward(calculation_date, self.dividend_rate, ql.Actual365Fixed()))\n", + "\n", + " # Initial stock price\n", + " initialValue = ql.QuoteHandle(ql.SimpleQuote(spot_price))\n", + "\n", + " # Heston process parameters\n", + " heston_process = ql.HestonProcess(riskFreeTS, dividendTS, initialValue, v0, kappa, theta, sigma, rho)\n", + " hestonModel = ql.HestonModel(heston_process)\n", + "\n", + " # Use the Heston analytic engine\n", + " engine = ql.AnalyticHestonEngine(hestonModel)\n", + " european_option.setPricingEngine(engine)\n", + "\n", + " # Calculate the Heston model price\n", + " h_price = european_option.NPV()\n", + "\n", + " return h_price\n", + "\n", + " def predict_american_option_price(self, strike, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", + " # Set the evaluation date\n", + " ql.Settings.instance().evaluationDate = self.calculation_date\n", + "\n", + " # Construct the American Option\n", + " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", + " exercise = ql.AmericanExercise(self.calculation_date, maturity_date)\n", + " american_option = ql.VanillaOption(payoff, exercise)\n", + "\n", + " # Yield term structures for risk-free rate and dividend\n", + " riskFreeTS = ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.risk_free_rate, ql.Actual365Fixed()))\n", + " dividendTS = 
ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.dividend_rate, ql.Actual365Fixed()))\n", + "\n", + " # Initial stock price\n", + " initialValue = ql.QuoteHandle(ql.SimpleQuote(spot_price))\n", + "\n", + " # Heston process parameters\n", + " heston_process = ql.HestonProcess(riskFreeTS, dividendTS, initialValue, v0, kappa, theta, sigma, rho)\n", + " heston_model = ql.HestonModel(heston_process)\n", + "\n", + "\n", + " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", + " exercise = ql.AmericanExercise(self.calculation_date, maturity_date)\n", + " american_option = ql.VanillaOption(payoff, exercise)\n", + " heston_fd_engine = ql.FdHestonVanillaEngine(heston_model)\n", + " american_option.setPricingEngine(heston_fd_engine)\n", + " option_price = american_option.NPV()\n", + "\n", + " return option_price\n", + "\n", + " def objective_function(self, params, market_data, spot_price, dividend_rate, risk_free_rate):\n", + " v0, theta, kappa, sigma, rho = params\n", + "\n", + " # Sum of squared differences between market prices and model prices\n", + " error = 0.0\n", + " for i, row in market_data.iterrows():\n", + " model_price = self.predict_option_price(row['strike'], row['maturity_date'], spot_price, \n", + " v0, theta, kappa, sigma, rho)\n", + " error += (model_price - row['option_price']) ** 2\n", + " \n", + " return error\n", + "\n", + " def calibrate_model(self, ticker, expiration_date_str):\n", + " # Get the option market data dynamically from Yahoo Finance\n", + " market_data = get_market_data(ticker, expiration_date_str)\n", + "\n", + " # Initial guesses for Heston parameters\n", + " initial_params = [0.04, 0.04, 0.1, 0.1, -0.75]\n", + "\n", + " # Bounds for the parameters to ensure realistic values\n", + " bounds = [(0.0001, 1.0), # v0\n", + " (0.0001, 1.0), # theta\n", + " (0.001, 2.0), # kappa\n", + " (0.001, 1.0), # sigma\n", + " (-0.75, 0.0)] # rho\n", + "\n", + " # Optimize the parameters to minimize the error between model and market prices\n", + " result = minimize(self.objective_function, initial_params, args=(market_data, self.spot_price, self.dividend_rate, self.risk_free_rate),\n", + " bounds=bounds, method='L-BFGS-B')\n", + "\n", + " # Optimized Heston parameters\n", + " v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt = result.x\n", + "\n", + " return v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt\n" + ] + }, + { + "cell_type": "markdown", + "id": "a941aa32", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Model Calibration\n", + "* The calibration process aims to optimize the Heston model parameters (v0, theta, kappa, sigma, rho) by minimizing the difference between model-predicted option prices and observed market prices.\n", + "* In this implementation, the model is calibrated to current market data, specifically using option prices from the selected ticker and expiration date.\n", + "\n", + "Let's specify `calculation_date` and `strike_price` as input parameters for the model to verify its functionality and confirm it operates as expected." 
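For reference, the calibration performed by `calibrate_model` solves a bounded nonlinear least-squares problem over the five Heston parameters, matching the squared-error `objective_function` and the L-BFGS-B bounds defined above:

$$
\min_{v_0,\ \theta,\ \kappa,\ \sigma,\ \rho}\ \sum_{i}\left(C^{\text{Heston}}(K_i, T_i;\ v_0, \theta, \kappa, \sigma, \rho) - C^{\text{market}}_i\right)^2
$$

where $C^{\text{market}}_i$ is the observed call price at strike $K_i$ and maturity $T_i$, and the parameters are constrained to the ranges given in the code (for example, $v_0 \in [0.0001, 1]$ and $\rho \in [-0.75, 0]$).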
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d61dfca", + "metadata": {}, + "outputs": [], + "source": [ + "calculation_date = ql.Date(26, 11, 2024)\n", + "# Convert expiration date string to QuantLib.Date\n", + "expiry_date_parts = list(map(int, expiration_date.split('-')))\n", + "maturity_date = ql.Date(expiry_date_parts[2], expiry_date_parts[1], expiry_date_parts[0])\n", + "strike_price = 460.0\n", + "\n", + "hm = HestonModel(\n", + " ticker=ticker,\n", + " expiration_date_str= expiration_date,\n", + " calculation_date= calculation_date,\n", + " spot_price= option_params['spot_price'],\n", + " dividend_rate = option_params['dividend_rate'],\n", + " risk_free_rate = option_params['risk_free_rate']\n", + ")\n", + "\n", + "# Let's calibrate model\n", + "v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt = hm.calibrate_model(ticker, expiration_date)\n", + "print(f\"Optimized Heston parameters: v0={v0_opt}, theta={theta_opt}, kappa={kappa_opt}, sigma={sigma_opt}, rho={rho_opt}\")\n", + "\n", + "\n", + "# option price\n", + "h_price = hm.predict_option_price(strike_price, maturity_date, option_params['spot_price'], v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt)\n", + "print(\"The Heston model price for the option is:\", h_price)" + ] + }, + { + "cell_type": "markdown", + "id": "75313272", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Model Evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "2e6471ef", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Benchmark Testing\n", + "The benchmark testing framework provides a robust way to validate the Heston model implementation and understand the relationships between European and American option prices under stochastic volatility conditions.\n", + "Let's compares European and American option prices using the Heston model." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "810cf887", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.BenchmarkTest\")\n", + "def benchmark_test(hm_model, strikes, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", + " \"\"\"\n", + " Compares European and American option prices using the Heston model.\n", + "\n", + " This test evaluates the price differences between European and American options\n", + " across multiple strike prices while keeping other parameters constant. The comparison\n", + " helps understand the early exercise premium of American options over their European\n", + " counterparts under stochastic volatility conditions.\n", + "\n", + " Args:\n", + " hm_model: HestonModel instance for option pricing calculations\n", + " strikes (list[float]): List of strike prices to test\n", + " maturity_date (ql.Date): Option expiration date in QuantLib format\n", + " spot_price (float): Current price of the underlying asset\n", + " v0 (float, optional): Initial variance. Defaults to None.\n", + " theta (float, optional): Long-term variance. Defaults to None.\n", + " kappa (float, optional): Mean reversion rate. Defaults to None.\n", + " sigma (float, optional): Volatility of variance. Defaults to None.\n", + " rho (float, optional): Correlation between asset and variance. 
Defaults to None.\n", + "\n", + " Returns:\n", + " dict: Contains a DataFrame with the following columns:\n", + " - Strike: Strike prices tested\n", + " - Maturity date: Expiration date for all options\n", + " - Spot price: Current underlying price\n", + " - european model price: Prices for European options\n", + " - american model price: Prices for American options\n", + "\"\"\"\n", + " american_derived_prices = []\n", + " european_derived_prices = []\n", + " for K in strikes:\n", + " european_derived_prices.append(hm_model.predict_option_price(K, maturity_date, spot_price, v0, theta, kappa, sigma, rho))\n", + " american_derived_prices.append(hm_model.predict_american_option_price(K, maturity_date, spot_price, v0, theta, kappa, sigma, rho))\n", + "\n", + " data = {\n", + " \"Strike\": strikes,\n", + " \"Maturity date\": [maturity_date] * len(strikes),\n", + " \"Spot price\": [spot_price] * len(strikes),\n", + " \"european model price\": european_derived_prices,\n", + " \"american model price\": american_derived_prices,\n", + "\n", + " }\n", + " df1 = pd.DataFrame(data)\n", + " return {\"strikes variation benchmarking\": df1}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fdd6705", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.BenchmarkTest\",\n", + " params={\n", + " \"hm_model\": hm,\n", + " \"strikes\": [400, 425, 460, 495, 520],\n", + " \"maturity_date\": maturity_date,\n", + " \"spot_price\": option_params['spot_price'],\n", + " \"v0\":v0_opt,\n", + " \"theta\": theta_opt,\n", + " \"kappa\":kappa_opt ,\n", + " \"sigma\": sigma_opt,\n", + " \"rho\":rho_opt\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "e359b503", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Sensitivity Testing\n", + "The sensitivity testing framework provides a systematic approach to understanding how the Heston model responds to parameter changes, which is crucial for both model validation and practical application in trading and risk management." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51922313", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_test_provider.Sensitivity\")\n", + "def SensitivityTest(\n", + " model,\n", + " strike_price,\n", + " maturity_date,\n", + " spot_price,\n", + " v0_opt,\n", + " theta_opt,\n", + " kappa_opt,\n", + " sigma_opt,\n", + " rho_opt,\n", + "):\n", + " \"\"\"\n", + " Evaluates the sensitivity of American option prices to changes in model parameters.\n", + "\n", + " This test calculates option prices using the Heston model with optimized parameters.\n", + " It's designed to analyze how changes in various model inputs affect the option price,\n", + " which is crucial for understanding model behavior and risk management.\n", + "\n", + " Args:\n", + " model (HestonModel): Initialized Heston model instance wrapped in ValidMind model object\n", + " strike_price (float): Strike price of the option\n", + " maturity_date (ql.Date): Expiration date of the option in QuantLib format\n", + " spot_price (float): Current price of the underlying asset\n", + " v0_opt (float): Optimized initial variance parameter\n", + " theta_opt (float): Optimized long-term variance parameter\n", + " kappa_opt (float): Optimized mean reversion rate parameter\n", + " sigma_opt (float): Optimized volatility of variance parameter\n", + " rho_opt (float): Optimized correlation parameter between asset price and variance\n", + " \"\"\"\n", + " price = model.model.predict_american_option_price(\n", + " strike_price,\n", + " maturity_date,\n", + " spot_price,\n", + " v0_opt,\n", + " theta_opt,\n", + " kappa_opt,\n", + " sigma_opt,\n", + " rho_opt,\n", + " )\n", + "\n", + " return price\n" + ] + }, + { + "cell_type": "markdown", + "id": "408a05ef", + "metadata": {}, + "source": [ + "##### Common plot function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "104ca6dd", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_results(df, params: dict = None):\n", + " fig2 = plt.figure(figsize=(10, 6))\n", + " plt.plot(df[params[\"x\"]], df[params[\"y\"]], label=params[\"label\"])\n", + " plt.xlabel(params[\"xlabel\"])\n", + " plt.ylabel(params[\"ylabel\"])\n", + " \n", + " plt.title(params[\"title\"])\n", + " plt.legend()\n", + " plt.grid(True)\n", + " plt.show() # close the plot to avoid displaying it" + ] + }, + { + "cell_type": "markdown", + "id": "ca72b9e5", + "metadata": {}, + "source": [ + "Let's create ValidMind model object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae7093fa", + "metadata": {}, + "outputs": [], + "source": [ + "hm_model = vm.init_model(model=hm, input_id=\"HestonModel\")" + ] + }, + { + "cell_type": "markdown", + "id": "b2141640", + "metadata": {}, + "source": [ + "##### Strike sensitivity\n", + "Let's analyzes how option prices change as the strike price varies. We create a range of strike prices around the current strike (460) and observe the impact on option prices while keeping all other parameters constant." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea7f1cbe", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_test_provider.Sensitivity:ToStrike\",\n", + " inputs = {\n", + " \"model\": hm_model\n", + " },\n", + " param_grid={\n", + " \"strike_price\": list(np.linspace(460-50, 460+50, 10)),\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": [theta_opt],\n", + " \"kappa_opt\": [kappa_opt],\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\":[rho_opt]\n", + " },\n", + ")\n", + "result.log()\n", + "# Visualize how option prices change with different strike prices\n", + "plot_results(\n", + " pd.DataFrame(result.tables[0].data),\n", + " params={\n", + " \"x\": \"strike_price\",\n", + " \"y\":\"Value\",\n", + " \"label\":\"Strike price\",\n", + " \"xlabel\":\"Strike price\",\n", + " \"ylabel\":\"option price\",\n", + " \"title\":\"Heston option - Strike price Sensitivity\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "be143012", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Stress Testing\n", + "This stress testing framework provides a comprehensive view of how the Heston model behaves under different market conditions and helps identify potential risks in option pricing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2f01a40", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.Stressing\")\n", + "def StressTest(\n", + " model,\n", + " strike_price,\n", + " maturity_date,\n", + " spot_price,\n", + " v0_opt,\n", + " theta_opt,\n", + " kappa_opt,\n", + " sigma_opt,\n", + " rho_opt,\n", + "):\n", + " \"\"\"\n", + " Performs stress testing on Heston model parameters to evaluate option price sensitivity.\n", + "\n", + " This test evaluates how the American option price responds to stressed market conditions\n", + " by varying key model parameters. It's designed to:\n", + " 1. Identify potential model vulnerabilities\n", + " 2. Understand price behavior under extreme scenarios\n", + " 3. Support risk management decisions\n", + " 4. 
Validate model stability across parameter ranges\n", + "\n", + " Args:\n", + " model (HestonModel): Initialized Heston model instance wrapped in ValidMind model object\n", + " strike_price (float): Option strike price\n", + " maturity_date (ql.Date): Option expiration date in QuantLib format\n", + " spot_price (float): Current price of the underlying asset\n", + " v0_opt (float): Initial variance parameter under stress testing\n", + " theta_opt (float): Long-term variance parameter under stress testing\n", + " kappa_opt (float): Mean reversion rate parameter under stress testing\n", + " sigma_opt (float): Volatility of variance parameter under stress testing\n", + " rho_opt (float): Correlation parameter under stress testing\n", + " \"\"\"\n", + " price = model.model.predict_american_option_price(\n", + " strike_price,\n", + " maturity_date,\n", + " spot_price,\n", + " v0_opt,\n", + " theta_opt,\n", + " kappa_opt,\n", + " sigma_opt,\n", + " rho_opt,\n", + " )\n", + "\n", + " return price\n" + ] + }, + { + "cell_type": "markdown", + "id": "31fcbe9c", + "metadata": {}, + "source": [ + "##### Rho (correlation) and Theta (long term vol) stress test\n", + "Next, let's evaluates the sensitivity of a model's output to changes in the correlation parameter (rho) and the long-term variance parameter (theta) within a stochastic volatility framework." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6119b5d9", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.Stressing:TheRhoAndThetaParameters\",\n", + " inputs = {\n", + " \"model\": hm_model,\n", + " },\n", + " param_grid={\n", + " \"strike_price\": [460],\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": list(np.linspace(0.1, theta_opt+0.4, 5)),\n", + " \"kappa_opt\": [kappa_opt],\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\":list(np.linspace(rho_opt-0.2, rho_opt+0.2, 5))\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "be39cb3a", + "metadata": {}, + "source": [ + "##### Sigma stress test\n", + "Let's evaluates the sensitivity of a model's output to changes in the volatility parameter, sigma. This test is crucial for understanding how variations in market volatility impact the model's valuation of financial instruments, particularly options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dc189b7", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.Stressing:TheSigmaParameter\",\n", + " inputs = {\n", + " \"model\": hm_model,\n", + " },\n", + " param_grid={\n", + " \"strike_price\": [460],\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": [theta_opt],\n", + " \"kappa_opt\": [kappa_opt],\n", + " \"sigma_opt\": list(np.linspace(0.1, sigma_opt+0.6, 5)),\n", + " \"rho_opt\": [rho_opt]\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "173a5294", + "metadata": {}, + "source": [ + "##### Stress kappa\n", + "Let's evaluates the sensitivity of a model's output to changes in the kappa parameter, which is a mean reversion rate in stochastic volatility models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dae9714f", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.Stressing:TheKappaParameter\",\n", + " inputs = {\n", + " \"model\": hm_model,\n", + " },\n", + " param_grid={\n", + " \"strike_price\": [460],\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": [theta_opt],\n", + " \"kappa_opt\": list(np.linspace(kappa_opt, kappa_opt+0.2, 5)),\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\": [rho_opt]\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "b4d1d968", + "metadata": {}, + "source": [ + "##### Stress theta\n", + "Let's evaluates the sensitivity of a model's output to changes in the parameter theta, which represents the long-term variance in a stochastic volatility model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68df3db", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.Stressing:TheThetaParameter\",\n", + " inputs = {\n", + " \"model\": hm_model,\n", + " },\n", + " param_grid={\n", + " \"strike_price\": [460],\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": list(np.linspace(0.1, theta_opt+0.9, 5)),\n", + " \"kappa_opt\": [kappa_opt],\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\": [rho_opt]\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "32e70456", + "metadata": {}, + "source": [ + "##### Stress rho\n", + "Let's evaluates the sensitivity of a model's output to changes in the correlation parameter, rho, within a stochastic volatility (SV) model framework. This test is crucial for understanding how variations in rho, which represents the correlation between the asset price and its volatility, impact the model's valuation output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5ca3fc2", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.Stressing:TheRhoParameter\",\n", + " inputs = {\n", + " \"model\": hm_model,\n", + " },\n", + " param_grid={\n", + " \"strike_price\": [460],\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": [theta_opt],\n", + " \"kappa_opt\": [kappa_opt],\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\": list(np.linspace(rho_opt-0.2, rho_opt+0.2, 5))\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "892c5347", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", + "\n", + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. From the **Model Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. Click and expand the **Model Development** section.\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. 
From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html)\n", + "\n", + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/developer/model-testing/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/developer/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-de5d1e182b09403abddabc2850f2dd05", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-1QuffXMV-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/site/notebooks/code_samples/code_explainer/customer_churn_full_suite.py b/site/notebooks/use_cases/code_explainer/customer_churn_full_suite.py similarity index 98% rename from site/notebooks/code_samples/code_explainer/customer_churn_full_suite.py rename to site/notebooks/use_cases/code_explainer/customer_churn_full_suite.py index 6bd9c2a8ab..3f7c828092 100644 --- a/site/notebooks/code_samples/code_explainer/customer_churn_full_suite.py +++ b/site/notebooks/use_cases/code_explainer/customer_churn_full_suite.py @@ -7,7 +7,7 @@ Welcome! Let's get you started with the basic process of documenting models with ValidMind. -You will learn how to initialize the ValidMind Library, load a sample dataset to train a simple classification model, +You will learn how to initialize the ValidMind Library, load a sample dataset to train a simple classification model, and then run a ValidMind test suite to quickly generate documentation about the data and model. This script uses the Bank Customer Churn Prediction sample dataset from Kaggle to train the classification model. @@ -162,8 +162,8 @@ def load_model(self, version: str) -> Any: ) vm_test_ds = vm.init_dataset( - dataset=test_df, - input_id="test_dataset", + dataset=test_df, + input_id="test_dataset", target_column=customer_churn.target_column ) diff --git a/site/notebooks/use_cases/code_explainer/model_source_code_documentation_template.yaml b/site/notebooks/use_cases/code_explainer/model_source_code_documentation_template.yaml new file mode 100644 index 0000000000..7065b35a7d --- /dev/null +++ b/site/notebooks/use_cases/code_explainer/model_source_code_documentation_template.yaml @@ -0,0 +1,142 @@ +# Copyright © 2023-2026 ValidMind Inc. All rights reserved. +# Refer to the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +- id: code_overview + title: Codebase Overview + guidelines: + - Describe the overall structure of the source code repository. + - Identify main modules, folders, and scripts. + - Highlight entry points for training, inference, and evaluation. + - State the main programming languages and frameworks used. + contents: + - content_type: text + content_id: code_structure_summary +- id: model_overview + title: Model Overview + guidelines: + - Describe the overall structure of the source code repository. + - Identify main modules, folders, and scripts. + - Highlight entry points for training, inference, and evaluation. + - State the main programming languages and frameworks used. + contents: + - content_type: text + content_id: model_overview +- id: environment_setup + title: Environment and Dependencies + guidelines: + - List Python packages and system dependencies (OS, compilers, etc.). + - Reference environment files (requirements.txt, environment.yml, + Dockerfile). + - Include setup instructions using Conda, virtualenv, or containers. 
+ contents: + - content_type: text + content_id: setup_instructions +- id: data_interface + title: Data Ingestion and Preprocessing + guidelines: + - Specify data input formats and sources. + - Document ingestion, validation, and transformation logic. + - Explain how raw data is preprocessed and features are generated. + contents: + - content_type: text + content_id: data_handling_notes +- id: model_implementation + title: Model Implementation Details + guidelines: + - Describe the core model code structure (classes, functions). + - Link code to theoretical models or equations when applicable. + - Note custom components like loss functions or feature selectors. + contents: + - content_type: text + content_id: model_code_description +- id: training_pipeline + title: Model Training Pipeline + guidelines: + - Explain the training process, optimization strategy, and hyperparameters. + - Describe logging, checkpointing, and early stopping mechanisms. + - Include references to training config files or tuning logic. + contents: + - content_type: text + content_id: training_logic_details +- id: evaluation_pipeline + title: Evaluation and Validation Code + guidelines: + - Describe how validation is implemented and metrics are calculated. + - Include plots and diagnostic tools (e.g., ROC, SHAP, confusion matrix). + - State how outputs are logged and persisted. + contents: + - content_type: text + content_id: evaluation_logic_notes +- id: inference_pipeline + title: Inference and Scoring Logic + guidelines: + - Detail how the trained model is loaded and used for predictions. + - Explain I/O formats and APIs for serving or batch scoring. + - Include any preprocessing/postprocessing logic required. + contents: + - content_type: text + content_id: inference_mechanism +- id: configuration_management + title: Configuration and Parameters + guidelines: + - Describe configuration management (files, CLI args, env vars). + - Highlight default parameters and override mechanisms. + - Reference versioning practices for config files. + contents: + - content_type: text + content_id: config_control_notes +- id: testing_and_validation + title: Unit and Integration Testing + guidelines: + - List unit and integration tests and what they cover. + - Mention testing frameworks and coverage tools used. + - Explain testing strategy for production-readiness. + contents: + - content_type: text + content_id: test_strategy_overview +- id: logging + title: Logging and Monitoring Hooks + guidelines: + - Describe logging configuration and structure. + - Highlight real-time monitoring or observability integrations. + - List key events, metrics, or alerts tracked. + contents: + - content_type: text + content_id: logging_notes +- id: version_control + title: Code and Model Versioning + guidelines: + - Describe Git usage, branching, tagging, and commit standards. + - Include model artifact versioning practices (e.g., DVC, MLflow). + - Reference any automation in CI/CD. + contents: + - content_type: text + content_id: version_tracking_description +- id: security_and_compliance + title: Security and Access Control + guidelines: + - Document access controls for source code and data. + - Include any encryption, PII handling, or compliance measures. + - Mention secure deployment practices. + contents: + - content_type: text + content_id: security_policies_notes +- id: execution_examples + title: Example Runs and Scripts + guidelines: + - Provide working script examples (e.g., `train.py`, `predict.py`). 
+ - Include CLI usage instructions or sample notebooks. + - Link to demo datasets or test scenarios. + contents: + - content_type: text + content_id: runnable_examples +- id: known_issues_and_todos + title: Known Issues and Future Improvements + guidelines: + - List current limitations or technical debt. + - Outline proposed enhancements or refactors. + - Reference relevant tickets, GitHub issues, or roadmap items. + contents: + - content_type: text + content_id: issues_and_improvements_log diff --git a/site/notebooks/use_cases/code_explainer/quickstart_code_explainer_demo.ipynb b/site/notebooks/use_cases/code_explainer/quickstart_code_explainer_demo.ipynb new file mode 100644 index 0000000000..99ee1abf66 --- /dev/null +++ b/site/notebooks/use_cases/code_explainer/quickstart_code_explainer_demo.ipynb @@ -0,0 +1,874 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quickstart for model code documentation\n", + "\n", + "Welcome! This notebook demonstrates how to use the ValidMind code explainer to automatically generate comprehensive documentation for your codebase. The code explainer analyzes your source code and provides detailed explanations across various aspects of your implementation.\n", + "\n", + "\n", + "\n", + "## About Code Explainer\n", + "The ValidMind code explainer is a powerful tool that automatically analyzes your source code and generates comprehensive documentation. It helps you:\n", + "\n", + "- Understand the structure and organization of your codebase\n", + "- Document dependencies and environment setup\n", + "- Explain data processing and model implementation details\n", + "- Document training, evaluation, and inference pipelines\n", + "- Track configuration, testing, and security measures\n", + "\n", + "This tool is particularly useful for:\n", + "- Onboarding new team members\n", + "- Maintaining up-to-date documentation\n", + "- Ensuring code quality and best practices\n", + "- Facilitating code reviews and audits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About Code Explainer](#toc1__) \n", + "- [About ValidMind](#toc2__) \n", + " - [Before you begin](#toc2_1__) \n", + " - [New to ValidMind?](#toc2_2__) \n", + " - [Key concepts](#toc2_3__) \n", + "- [Setting up](#toc3__) \n", + " - [Install the ValidMind Library](#toc3_1__) \n", + " - [Initialize the ValidMind Library](#toc3_2__) \n", + " - [Register sample model](#toc3_2_1__) \n", + " - [Apply documentation template](#toc3_2_2__) \n", + " - [Get your code snippet](#toc3_2_3__) \n", + " - [Preview the documentation template](#toc3_3__) \n", + "- [Common function](#toc4__) \n", + "- [Default Behavior](#toc5__) \n", + "- [Codebase Overview](#toc6__) \n", + "- [Environment and Dependencies ('environment_setup')](#toc7__) \n", + "- [Data Ingestion and Preprocessing](#toc8__) \n", + "- [Model Implementation Details](#toc9__) \n", + "- [Model Training Pipeline](#toc10__) \n", + "- [Evaluation and Validation Code](#toc11__) \n", + "- [Inference and Scoring Logic](#toc12__) \n", + "- [Configuration and Parameters](#toc13__) \n", + "- [Unit and Integration Testing](#toc14__) \n", + "- [Logging and Monitoring Hooks](#toc15__) \n", + "- [Code and Model Versioning](#toc16__) \n", + "- [Security and Access Control](#toc17__) \n", + "- [Example Runs and Scripts](#toc18__) \n", + "- [Known Issues and Future Improvements](#toc19__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Model Source Code Documentation`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Common function\n", + "The code above defines two key functions:\n", + "1. A function to read source code from 'customer_churn_full_suite.py' file\n", + "2. An 'explain_code' function that uses ValidMind's experimental agents to analyze and explain code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_code=\"\"\n", + "with open(\"customer_churn_full_suite.py\", \"r\") as f:\n", + " source_code = f.read()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `vm.experimental.agents.run_task` function is used to execute AI agent tasks.\n", + "\n", + "It requires:\n", + "- task: The type of task to run (e.g. 
`code_explainer`)\n", + "- input: A dictionary containing task-specific parameters\n", + " - For `code_explainer`, this includes:\n", + " - **source_code** (str): The code to be analyzed\n", + " - **user_instructions** (str): Instructions for how to analyze the code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def explain_code(content_id: str, user_instructions: str):\n", + " \"\"\"Run code explanation task and log the results.\n", + " By default, the code explainer includes sections for:\n", + " - Main Purpose and Overall Functionality\n", + " - Breakdown of Key Functions or Components\n", + " - Potential Risks or Failure Points \n", + " - Assumptions or Limitations\n", + " If you want default sections, specify user_instructions as an empty string.\n", + " \n", + " Args:\n", + " user_instructions (str): Instructions for how to analyze the code\n", + " content_id (str): ID to use when logging the results\n", + " \n", + " Returns:\n", + " The result object from running the code explanation task\n", + " \"\"\"\n", + " result = vm.experimental.agents.run_task(\n", + " task=\"code_explainer\",\n", + " input={\n", + " \"source_code\": source_code,\n", + " \"user_instructions\": user_instructions\n", + " }\n", + " )\n", + " result.log(content_id=content_id)\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Default Behavior" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the code explainer includes sections for:\n", + "- Main Purpose and Overall Functionality\n", + "- Breakdown of Key Functions or Components\n", + "- Potential Risks or Failure Points \n", + "- Assumptions or Limitations\n", + "\n", + "If you want default sections, specify `user_instructions` as an empty string. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.experimental.agents.run_task(\n", + " task=\"code_explainer\",\n", + " input={\n", + " \"source_code\": source_code,\n", + " \"user_instructions\": \"\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Codebase Overview\n", + "\n", + "Let's analyze your codebase structure to understand the main modules, components, entry points and their relationships. We'll also examine the technology stack and frameworks that are being utilized in the implementation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe the overall structure of the source code repository.\n", + " - Identify main modules, folders, and scripts.\n", + " - Highlight entry points for training, inference, and evaluation.\n", + " - State the main programming languages and frameworks used.\n", + " \"\"\",\n", + " content_id=\"code_structure_summary\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\",\n", + " content_id=\"code_structure_summary\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Environment and Dependencies\n", + "Let's document the technical requirements and setup needed to run your code, including Python packages, system dependencies, and environment configuration files. Understanding these requirements is essential for proper development environment setup and consistent deployments across different environments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - List Python packages and system dependencies (OS, compilers, etc.).\n", + " - Reference environment files (requirements.txt, environment.yml, Dockerfile).\n", + " - Include setup instructions using Conda, virtualenv, or containers.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"setup_instructions\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Data Ingestion and Preprocessing\n", + "Let's document how your code handles data, including data sources, validation procedures, and preprocessing steps. We'll examine the data pipeline architecture, covering everything from initial data loading through feature engineering and quality checks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Specify data input formats and sources.\n", + " - Document ingestion, validation, and transformation logic.\n", + " - Explain how raw data is preprocessed and features are generated.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"data_handling_notes\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Model Implementation Details\n", + "Let's document the core implementation details of your model, including its architecture, components, and key algorithms. Understanding the technical implementation is crucial for maintenance, debugging, and future improvements to the codebase. We'll examine how theoretical concepts are translated into working code."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe the core model code structure (classes, functions).\n", + " - Link code to theoretical models or equations when applicable.\n", + " - Note custom components like loss functions or feature selectors.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"model_code_description\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Model Training Pipeline\n", + "\n", + "Let's document the training pipeline implementation, including how models are trained, optimized and evaluated. We'll examine the training process workflow, hyperparameter tuning approach, and model checkpointing mechanisms. This section provides insights into how the model learns from data and achieves optimal performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Explain the training process, optimization strategy, and hyperparameters.\n", + " - Describe logging, checkpointing, and early stopping mechanisms.\n", + " - Include references to training config files or tuning logic.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"training_logic_details\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Evaluation and Validation Code\n", + "Let's examine how the model's validation and evaluation code is implemented, including the metrics calculation and validation processes. We'll explore the diagnostic tools and visualization methods used to assess model performance. This section will also cover how validation results are logged and stored for future reference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe how validation is implemented and metrics are calculated.\n", + " - Include plots and diagnostic tools (e.g., ROC, SHAP, confusion matrix).\n", + " - State how outputs are logged and persisted.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"evaluation_logic_notes\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Inference and Scoring Logic\n", + "Let's examine how the model performs inference and scoring on new data. This section will cover the implementation details of loading trained models, making predictions, and any required pre/post-processing steps. We'll also look at the APIs and interfaces available for both real-time serving and batch scoring scenarios." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Detail how the trained model is loaded and used for predictions.\n", + " - Explain I/O formats and APIs for serving or batch scoring.\n", + " - Include any preprocessing/postprocessing logic required.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"inference_mechanism\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Configuration and Parameters\n", + "Let's explore how configuration and parameters are managed in the codebase. We'll examine the configuration files, command-line arguments, environment variables, and other mechanisms used to control model behavior. This section will also cover parameter versioning and how different configurations are tracked across model iterations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe configuration management (files, CLI args, env vars).\n", + " - Highlight default parameters and override mechanisms.\n", + " - Reference versioning practices for config files.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"config_control_notes\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Unit and Integration Testing\n", + "Let's examine the testing strategy and implementation in the codebase. We'll analyze the unit tests, integration tests, and testing frameworks used to ensure code quality and reliability. This section will also cover test coverage metrics and continuous integration practices." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - List unit and integration tests and what they cover.\n", + " - Mention testing frameworks and coverage tools used.\n", + " - Explain testing strategy for production-readiness.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"test_strategy_overview\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Logging and Monitoring Hooks\n", + "Let's analyze how logging and monitoring are implemented in the codebase. We'll examine the logging configuration, monitoring hooks, and key metrics being tracked. This section will also cover any real-time observability integrations and alerting mechanisms in place." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe logging configuration and structure.\n", + " - Highlight real-time monitoring or observability integrations.\n", + " - List key events, metrics, or alerts tracked.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"logging_monitoring_notes\"\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Code and Model Versioning\n", + "Let's examine how code and model versioning is managed in the codebase. This section will cover version control practices, including Git workflows and model artifact versioning tools like DVC or MLflow. We'll also look at how versioning integrates with the CI/CD pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe Git usage, branching, tagging, and commit standards.\n", + " - Include model artifact versioning practices (e.g., DVC, MLflow).\n", + " - Reference any automation in CI/CD.\n", + " Please remove the following sections: \n", + " - Potential Risks or Failure Points\n", + " - Assumptions or Limitations\n", + " - Breakdown of Key Functions or Components\n", + " Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"version_tracking_description\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Security and Access Control\n", + "Let's analyze the security and access control measures implemented in the codebase. We'll examine how sensitive data and code are protected through access controls, encryption, and compliance measures. Additionally, we'll review secure deployment practices and any specific handling of PII data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Document access controls for source code and data.\n", + " - Include any encryption, PII handling, or compliance measures.\n", + " - Mention secure deployment practices.\n", + " Please remove the following sections: \n", + " - Potential Risks or Failure Points\n", + " - Assumptions or Limitations\n", + " - Breakdown of Key Functions or Components\n", + " Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"security_policies_notes\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Example Runs and Scripts\n", + "Let's explore example runs and scripts that demonstrate how to use this codebase in practice. We'll look at working examples, command-line usage, and sample notebooks that showcase the core functionality. This section will also point to demo datasets and test scenarios that can help new users get started quickly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Provide working script examples.\n", + " - Include CLI usage instructions or sample notebooks.\n", + " - Link to demo datasets or test scenarios.\n", + " Please remove the following sections: \n", + " - Potential Risks or Failure Points\n", + " - Assumptions or Limitations\n", + " - Breakdown of Key Functions or Components\n", + " Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"runnable_examples\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Known Issues and Future Improvements\n", + "Let's examine the current limitations and areas for improvement in the codebase. This section will document known technical debt, bugs, and feature gaps that need to be addressed. We'll also outline proposed enhancements and reference any existing tickets or GitHub issues tracking these improvements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - List current limitations or technical debt.\n", + " - Outline proposed enhancements or refactors.\n", + " - Reference relevant tickets, GitHub issues, or roadmap items.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"issues_and_improvements_log\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "copyright-ccbede139a26452183291a108b791513", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-1QuffXMV-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/site/notebooks/code_samples/credit_risk/CreditRiskData.xlsx b/site/notebooks/use_cases/credit_risk/CreditRiskData.xlsx similarity index 100% rename from site/notebooks/code_samples/credit_risk/CreditRiskData.xlsx rename to site/notebooks/use_cases/credit_risk/CreditRiskData.xlsx diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb b/site/notebooks/use_cases/credit_risk/application_scorecard_executive.ipynb similarity index 98% rename from site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb rename to site/notebooks/use_cases/credit_risk/application_scorecard_executive.ipynb index 2115a88b50..6ac66e9458 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb +++ b/site/notebooks/use_cases/credit_risk/application_scorecard_executive.ipynb @@ -98,7 +98,7 @@ "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -163,8 +163,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Credit Risk - CECL`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb b/site/notebooks/use_cases/credit_risk/application_scorecard_full_suite.ipynb similarity index 99% rename from site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb rename to site/notebooks/use_cases/credit_risk/application_scorecard_full_suite.ipynb index 83a6d276c8..1666b1d25c 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb +++ b/site/notebooks/use_cases/credit_risk/application_scorecard_full_suite.ipynb @@ -112,7 +112,7 @@ "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -177,8 +177,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Credit Risk - CECL`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_with_bias.ipynb b/site/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb similarity index 99% rename from site/notebooks/code_samples/credit_risk/application_scorecard_with_bias.ipynb rename to site/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb index a915608315..f845c74e22 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_with_bias.ipynb +++ b/site/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb @@ -113,7 +113,7 @@ "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. 
See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -178,8 +178,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Credit Risk - CECL`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb b/site/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb similarity index 99% rename from site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb rename to site/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb index 357d572b79..7e0cce645e 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb +++ b/site/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb @@ -125,7 +125,7 @@ "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -190,8 +190,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Credit Risk - CECL`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py b/site/notebooks/use_cases/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py similarity index 100% rename from site/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py rename to site/notebooks/use_cases/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py diff --git a/site/notebooks/code_samples/credit_risk/document_excel_application_scorecard.ipynb b/site/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb similarity index 99% rename from site/notebooks/code_samples/credit_risk/document_excel_application_scorecard.ipynb rename to site/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb index cc7685c9ca..6d693eb538 100644 --- a/site/notebooks/code_samples/credit_risk/document_excel_application_scorecard.ipynb +++ b/site/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb @@ -109,7 +109,7 @@ "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -179,8 +179,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Credit Risk - CECL`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." @@ -488,7 +486,7 @@ "\n", "
Want to learn more about navigating ValidMind tests?\n", "

\n", - "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests" ] }, { diff --git a/site/notebooks/code_samples/model_validation/validate_application_scorecard.ipynb b/site/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb similarity index 99% rename from site/notebooks/code_samples/model_validation/validate_application_scorecard.ipynb rename to site/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb index 22ab661f78..5cc55f8473 100644 --- a/site/notebooks/code_samples/model_validation/validate_application_scorecard.ipynb +++ b/site/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb @@ -151,7 +151,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html))\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -185,8 +185,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Credit Risk — CECL`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down — don’t worry, we’ll adjust these permissions next for validation.\n", "\n", "5. Click **Register Model** to add the model to your inventory." @@ -904,7 +902,7 @@ "source": [ "
Want to learn more about navigating ValidMind tests?\n", "

\n", - "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests" ] }, { @@ -1588,7 +1586,7 @@ "source": [ "
Want to learn more about custom tests?\n", "

\n", - "Refer to our in-depth introduction to custom tests: Implement custom tests
" + "Refer to our in-depth introduction to custom tests: Implement custom tests" ] }, { @@ -1789,7 +1787,7 @@ "\n", "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", "\n", - "- [Code samples](https://github.com/validmind/validmind-library/tree/main/notebooks/code_samples)\n", + "- [Use cases](https://github.com/validmind/validmind-library/tree/main/notebooks/use_cases)\n", "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)\n", "\n", "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." diff --git a/site/notebooks/code_samples/model_validation/xgb_model_champion.pkl b/site/notebooks/use_cases/model_validation/xgb_model_champion.pkl similarity index 100% rename from site/notebooks/code_samples/model_validation/xgb_model_champion.pkl rename to site/notebooks/use_cases/model_validation/xgb_model_champion.pkl diff --git a/site/notebooks/code_samples/nlp_and_llm/datasets/bbc_text_cls.csv b/site/notebooks/use_cases/nlp_and_llm/datasets/bbc_text_cls.csv similarity index 100% rename from site/notebooks/code_samples/nlp_and_llm/datasets/bbc_text_cls.csv rename to site/notebooks/use_cases/nlp_and_llm/datasets/bbc_text_cls.csv diff --git a/site/notebooks/code_samples/nlp_and_llm/datasets/bbc_text_cls_reference.csv b/site/notebooks/use_cases/nlp_and_llm/datasets/bbc_text_cls_reference.csv similarity index 100% rename from site/notebooks/code_samples/nlp_and_llm/datasets/bbc_text_cls_reference.csv rename to site/notebooks/use_cases/nlp_and_llm/datasets/bbc_text_cls_reference.csv diff --git a/site/notebooks/code_samples/nlp_and_llm/datasets/cnn_dailymail_100_with_predictions.csv b/site/notebooks/use_cases/nlp_and_llm/datasets/cnn_dailymail_100_with_predictions.csv similarity index 100% rename from site/notebooks/code_samples/nlp_and_llm/datasets/cnn_dailymail_100_with_predictions.csv rename to site/notebooks/use_cases/nlp_and_llm/datasets/cnn_dailymail_100_with_predictions.csv diff --git a/site/notebooks/code_samples/nlp_and_llm/datasets/cnn_dailymail_500_with_predictions.csv b/site/notebooks/use_cases/nlp_and_llm/datasets/cnn_dailymail_500_with_predictions.csv similarity index 100% rename from site/notebooks/code_samples/nlp_and_llm/datasets/cnn_dailymail_500_with_predictions.csv rename to site/notebooks/use_cases/nlp_and_llm/datasets/cnn_dailymail_500_with_predictions.csv diff --git a/site/notebooks/code_samples/nlp_and_llm/datasets/sentiments.csv b/site/notebooks/use_cases/nlp_and_llm/datasets/sentiments.csv similarity index 100% rename from site/notebooks/code_samples/nlp_and_llm/datasets/sentiments.csv rename to site/notebooks/use_cases/nlp_and_llm/datasets/sentiments.csv diff --git a/site/notebooks/code_samples/nlp_and_llm/datasets/sentiments_with_predictions.csv b/site/notebooks/use_cases/nlp_and_llm/datasets/sentiments_with_predictions.csv similarity index 100% rename from site/notebooks/code_samples/nlp_and_llm/datasets/sentiments_with_predictions.csv rename to site/notebooks/use_cases/nlp_and_llm/datasets/sentiments_with_predictions.csv diff --git a/site/notebooks/code_samples/nlp_and_llm/foundation_models_integration_demo.ipynb b/site/notebooks/use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb similarity index 99% rename from site/notebooks/code_samples/nlp_and_llm/foundation_models_integration_demo.ipynb rename to site/notebooks/use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb index 633c99e0c0..346ecb6e1d 100644 --- 
a/site/notebooks/code_samples/nlp_and_llm/foundation_models_integration_demo.ipynb +++ b/site/notebooks/use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb @@ -130,8 +130,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Analytics`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/code_samples/nlp_and_llm/foundation_models_summarization_demo.ipynb b/site/notebooks/use_cases/nlp_and_llm/foundation_models_summarization_demo.ipynb similarity index 100% rename from site/notebooks/code_samples/nlp_and_llm/foundation_models_summarization_demo.ipynb rename to site/notebooks/use_cases/nlp_and_llm/foundation_models_summarization_demo.ipynb diff --git a/site/notebooks/use_cases/nlp_and_llm/gen_ai_rag_template.yaml b/site/notebooks/use_cases/nlp_and_llm/gen_ai_rag_template.yaml new file mode 100644 index 0000000000..4f074f6a34 --- /dev/null +++ b/site/notebooks/use_cases/nlp_and_llm/gen_ai_rag_template.yaml @@ -0,0 +1,538 @@ +# Copyright © 2023-2026 ValidMind Inc. All rights reserved. +# Refer to the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +- id: conceptual_soundness + title: Conceptual Soundness + index_only: true + sections: + - id: model_overview + title: Model Overview + guidelines: + - Provide a clear and concise description of the model's main concept, + explaining the underlying financial theory or economic rationale. + - Justify the choice of the model, algorithm, or approach, relating it + to the financial institution's needs, objectives, and any relevant + industry standards. + - Discuss any alternative models or approaches considered during the + model development process, and explain why they were not selected. + - Describe any key assumptions made in the model and their potential + implications on the model's results and performance. + - Explain how the model's structure and design contribute to its + robustness, stability, and reliability in a financial context. + parent_section: conceptual_soundness + contents: + - content_id: model_overview + content_type: text + - id: intended_use_business_use_case + title: Intended Use and Business Use Case + index_only: true + condensed: true + parent_section: conceptual_soundness + sections: + - id: intended_use + title: Intended Use + parent_section: intended_use_business_use_case + guidelines: + - Clearly describe the specific business use case(s) for which the + model is designed, including the intended users and the financial + institution's objectives. + - Explain how the model fits into the overall business strategy and + decision-making processes of the financial institution. + - id: regulatory_requirements + title: Regulatory Requirements + parent_section: intended_use_business_use_case + guidelines: + - Detail any specific risks, regulatory requirements, or other + considerations associated with the model's intended use, and how + they have been addressed in the model development process. 
+ - id: model_limitations + title: Model Limitations + parent_section: intended_use_business_use_case + guidelines: + - Discuss the model's intended scope, including any limitations, + boundaries, or exclusions in its applicability. + - Describe any potential external factors, such as economic or + market conditions, that could impact the model's performance, and + how they have been considered in the model development process. + - id: model_selection + title: Model Selection + guidelines: + - Provide a detailed description of the selected model, including its + algorithm, mathematical foundations, and key features that make it + suitable for the intended use case. + - Explain the rationale behind choosing the specific model, and how it + addresses the financial institution's objectives, regulatory + requirements, and risk management needs. + - Compare the selected model with alternative models or approaches that + were considered during the model development process, highlighting + their strengths and weaknesses, and explaining why the chosen model is + the most appropriate. + - Describe any model customizations or adaptations made to better align + it with the financial institution's needs, and discuss the potential + impact of these changes on the model's performance. + - Explain any potential limitations or drawbacks of the selected model + in the context of the intended use case, and how they have been + mitigated or addressed during the model development process. + parent_section: conceptual_soundness +- id: data_preparation + title: Data Preparation + index_only: true + sections: + - id: data_description + title: Data description + guidelines: + - Provide a comprehensive overview of the data sources used in the + model, including internal and external sources, and specify the data's + time period, frequency, and granularity. + - Describe the main variables and features used in the model, including + both input and output variables, and explain their relevance to the + model's purpose and intended use case. + - Detail any data transformations, preprocessing, or feature engineering + performed on the raw data to prepare it for model input, and explain + the rationale for these transformations. + - Address any data quality concerns, such as missing values, outliers, + or inconsistencies, and describe the methods used to handle these + issues. + - Discuss any potential biases, dependencies, or limitations in the data + that could impact the model's performance, and explain how they have + been considered or mitigated during the data preparation process. 
+ contents: + - content_id: text_data_quality + content_type: text + options: + default_text: | + # Text Data Quality Metrics and Tests + - content_type: test + content_id: validmind.data_validation.Duplicates + - content_type: test + content_id: validmind.data_validation.nlp.StopWords + - content_type: test + content_id: validmind.data_validation.nlp.Punctuations + - content_type: test + content_id: validmind.data_validation.nlp.CommonWords + - content_type: test + content_id: validmind.data_validation.nlp.LanguageDetection + - content_type: test + content_id: validmind.data_validation.nlp.Toxicity + - content_type: test + content_id: validmind.data_validation.nlp.PolarityAndSubjectivity + - content_type: test + content_id: validmind.data_validation.nlp.Sentiment + parent_section: data_preparation + - id: feature_selection + title: Feature Selection and Engineering + guidelines: + - Describe the process used to select the most relevant features for the + model, including any feature selection techniques or criteria applied, + such as correlation analysis, mutual information, or forward/backward + selection. + - Explain the rationale behind including or excluding specific features, + and discuss their importance and contribution to the model's + performance and intended use case. + - Detail any feature engineering techniques applied to create new + features or transform existing ones, such as dimensionality reduction, + aggregation, or interaction terms, and explain their relevance and + purpose in the context of the model. + - Describe any data normalization or scaling techniques used to + standardize the input features, and explain the rationale for their + application in the model. + - Discuss potential multicollinearity, redundancy, or other issues among + the selected features, and describe the methods used to address these + concerns during the feature selection and engineering process. + parent_section: data_preparation + contents: + - content_id: feature_selection + content_type: text +- id: model_development + title: Model Development + index_only: true + sections: + - id: model_training + title: Model Training + guidelines: + - Describe the model training process, including the algorithm used, any + hyperparameters or settings, and the optimization techniques employed + to minimize the loss function or maximize the objective function. + - Detail the model validation and selection process, including the use + of cross-validation, holdout samples, or other techniques to assess + the model's performance and prevent overfitting. + - Provide a summary of the training results, including performance + metrics such as accuracy, precision, recall, F1 score, or other + relevant measures, depending on the model's intended use case. + - Discuss any challenges, issues, or trade-offs encountered during the + model training process, such as overfitting, underfitting, or class + imbalance, and explain how they were addressed or mitigated. + - Describe any tuning or optimization steps performed to improve the + model's performance, such as hyperparameter tuning, feature selection, + or other adjustments, and explain the rationale for these changes. + parent_section: model_development + contents: + - content_id: model_training + content_type: text + - id: prompt_validation + title: Prompt Validation + guidelines: + - Provide documentation of the prompt used for the model. Describe the + type of prompting used (e.g. few-shot, zero-shot, chain-of-thought + etc.). 
Explain how the prompt was constructed and how it was + validated. + parent_section: model_development + contents: + - content_type: test + content_id: validmind.prompt_validation.Bias + options: + title: Bias + - content_type: test + content_id: validmind.prompt_validation.Clarity + options: + title: Clarity + - content_type: test + content_id: validmind.prompt_validation.Conciseness + options: + title: Conciseness + - content_type: test + content_id: validmind.prompt_validation.Delimitation + options: + title: Delimitation + - content_type: test + content_id: validmind.prompt_validation.NegativeInstruction + options: + title: Negative Instruction + - content_type: test + content_id: validmind.prompt_validation.Specificity + options: + title: Specificity + - id: model_evaluation + title: Model Evaluation + parent_section: model_development + guidelines: + - Describe the process used to evaluate the model's performance on a + test or validation dataset that was not used during training, to + assess its generalizability and robustness. + - Present the key performance metrics for the model evaluation, such as + accuracy, precision, recall, F1 score, AUC-ROC, mean squared error, or + other relevant measures, depending on the model's intended use case. + - Provide graphical representations of the model's performance, such as + confusion matrices, ROC curves, or residual plots, to help visualize + its effectiveness and identify any areas for improvement. + - Discuss the model's performance in the context of its intended use + case, and compare it to any benchmarks, industry standards, or + alternative models, as appropriate. + - Identify any limitations, weaknesses, or areas for improvement in the + model's performance, and discuss potential strategies for addressing + these concerns in future iterations or updates. 
+ sections: + - id: embedding_model + title: Embedding Model + parent_section: model_evaluation + guidelines: + - Embedding model guidelines + contents: + - content_type: test + content_id: validmind.model_validation.embeddings.StabilityAnalysisRandomNoise + - content_type: test + content_id: validmind.model_validation.embeddings.StabilityAnalysisSynonyms + - content_type: test + content_id: validmind.model_validation.embeddings.StabilityAnalysisTranslation + - content_type: test + content_id: validmind.model_validation.embeddings.CosineSimilarityHeatmap + - content_type: test + content_id: validmind.model_validation.embeddings.CosineSimilarityDistribution + - content_type: test + content_id: validmind.model_validation.embeddings.PCAComponentsPairwisePlots + - id: retrieval_model + title: Retrieval model + parent_section: model_evaluation + guidelines: + - Retrieval model guideline + contents: + - content_type: test + content_id: validmind.model_validation.ragas.ContextRecall + - content_type: test + content_id: validmind.model_validation.ragas.ContextEntityRecall + - content_type: test + content_id: validmind.model_validation.ragas.ContextPrecision + - content_type: test + content_id: validmind.model_validation.ragas.ContextPrecisionWithoutReference + - id: rag_model + title: RAG model + parent_section: model_evaluation + guidelines: + - RAG model guideline + contents: + - content_type: test + content_id: validmind.model_validation.ragas.SemanticSimilarity + - content_type: test + content_id: validmind.model_validation.ragas.Faithfulness + - content_type: test + content_id: validmind.model_validation.ragas.ResponseRelevancy + - content_type: test + content_id: validmind.model_validation.ragas.AnswerCorrectness + - content_type: test + content_id: validmind.model_validation.TokenDisparity + - content_type: test + content_id: validmind.model_validation.BleuScore + - content_type: test + content_id: validmind.model_validation.RougeScore + - content_type: test + content_id: validmind.model_validation.MeteorScore + - content_type: test + content_id: validmind.model_validation.ragas.AspectCritic + - content_type: test + content_id: validmind.model_validation.ToxicityScore + - content_type: test + content_id: validmind.model_validation.RegardScore +- id: monitoring_governance + title: Monitoring and Governance + index_only: true + sections: + - id: monitoring_plan + title: Monitoring Plan + guidelines: + - Describe the plan for ongoing monitoring of the model's performance, + including the frequency of evaluations, the performance metrics to be + assessed, and any thresholds or triggers for action. + - Explain the process for identifying and addressing any changes in the + model's performance or the underlying data that may require model + updates, recalibration, or retraining. + - Detail the procedures for model validation and backtesting, to ensure + the model remains accurate, reliable, and compliant with regulatory + requirements and industry standards. + - Discuss the governance framework in place to oversee the model's use, + including the roles and responsibilities of various stakeholders, such + as model developers, validators, and risk managers. + - Describe the model's documentation and version control procedures, to + ensure that changes, updates, and improvements are properly tracked + and recorded. 
+ contents: + - content_id: monitoring_plan + content_type: text + options: + default_text: > + To ensure the ongoing effectiveness of the model, it will be + monitored on a regular basis. The monitoring plan includes the + following steps: + + + 1. **Performance Metrics Monitoring:** The key performance metrics + such as Accuracy, Precision, Recall, F1 Score, and ROC-AUC will be + + tracked on a regular basis. These metrics will be computed for + both the training, validation and test datasets to identify any + signs of + + overfitting or underperformance. + + 2. **Data Drift Monitoring:** Over time, the distribution of the + input data may change, a phenomenon known as data drift. This + could + + impact the model's performance. We will monitor the distributions + of the input features and the target variable to detect any + significant + + changes that may require retraining of the model. + + 3. **Outcome Monitoring:** The actual outcomes (whether a customer + exited the bank or not) will be compared with the model's + predictions + + to assess the model's performance in a real-world setting. + + 4. **Feature Importance Monitoring:** The importance of different + features for the model's predictions will be tracked. If there are + + significant shifts in feature importance, this could indicate + changes in the underlying patterns in the data. + + 5. **Periodic Model Retraining:** Depending on the findings from + the above monitoring activities, the model may need to be + retrained + + periodically. This will be done using the most recent data to + ensure that the model stays up-to-date with the latest patterns + and trends. + + + In terms of governance, a clear process will be put in place for + managing these monitoring activities. This will include clear + roles + + and responsibilities, documentation of the monitoring results, and + a process for deciding when and how to take action based on the + + monitoring results (for example, when to retrain the model). Any + major decisions about the model (such as changes to the model or + its + + features) will be made in a transparent and accountable manner, + with appropriate documentation and sign-off. + parent_section: monitoring_governance + - id: monitoring_implementation + title: Monitoring Implementation + guidelines: + - Describe the tools, systems, or platforms used to implement the + monitoring plan, including any relevant software, data pipelines, or + reporting tools. + - Detail the process for collecting and storing the data needed for + ongoing monitoring, including any data preprocessing, cleaning, or + transformation steps required. + - Explain the procedures for analyzing the model's performance metrics + and generating monitoring reports, including any statistical tests or + visualizations used to assess the model's performance and stability. + - Discuss the escalation process and communication channels for + reporting any significant deviations in the model's performance, as + well as the decision-making process for determining appropriate + actions, such as model updates or recalibration. + - Describe any training or educational programs in place to ensure that + relevant stakeholders, such as model developers, validators, and risk + managers, are equipped to understand, interpret, and act on the + monitoring results. 
+ parent_section: monitoring_governance + contents: + - content_id: monitoring_implementation + content_type: text + options: + default_text: > + Implementing the monitoring plan requires a systematic approach + with clear steps and procedures. Here are the steps for + implementing + + the monitoring plan: + + + 1. **Establish Baseline Metrics:** At the outset, we'll establish + baseline performance metrics for the model. These will serve as + + reference points for future comparison. + + 2. **Automate Metric Calculation:** Performance metrics such as + Accuracy, Precision, Recall, F1 Score, and ROC-AUC will be + automatically + + calculated and recorded for both the training and validation + datasets after each run of the model. + + 3. **Set Up Data Drift Monitors:** We'll set up automated + monitoring of the distributions of the input features and the + target + + variable. Any significant changes in these distributions will + trigger alerts. + + 4. **Implement Outcome Monitoring:** We'll compare the model's + predictions with the actual outcomes on a regular basis. + Discrepancies + + will be investigated to understand the root cause. + + 5. **Track Feature Importance:** We'll keep a record of feature + importance as indicated by the model. Any significant shifts in + + feature importance over time will be thoroughly reviewed. + + 6. **Establish Retraining Protocols:** Based on the findings from + the above monitoring activities, we'll establish clear protocols + + for when and how the model should be retrained. This may include + criteria for triggering a retraining, procedures for carrying out + + the retraining, and protocols for testing and validating the + retrained model before it is put back into service. + + 7. **Document and Review:** All monitoring activities and their + results will be documented in a transparent and accessible manner. + + These documents will be reviewed regularly by a designated team or + individual to ensure that any issues are promptly identified and + addressed. + + + By implementing this monitoring plan, we can ensure that the model + continues to perform well and that any issues are quickly + + identified and addressed. The ultimate aim is to ensure that the + model continues to provide accurate and reliable predictions that + + can support the bank's decision-making processes. + - id: governance_plan + title: Governance Plan + guidelines: + - Describe the overall governance framework and processes established to + ensure proper oversight and management of the model, including the + roles and responsibilities of key stakeholders such as model + developers, validators, and risk managers. + - Detail the policies and procedures for model risk management, + including model risk identification, assessment, and mitigation + strategies. + - Explain the model approval process, including any internal or external + reviews, audits, or regulatory assessments that must be completed + before the model is put into production. + - Discuss the procedures for ongoing model maintenance, updates, and + improvements, including the documentation and version control + processes to track and record changes to the model. + - Describe the contingency plans in place to manage potential model + failures or performance issues, such as fallback models, alternative + data sources, or manual processes, and explain the criteria for + activating these contingency measures. 
+ parent_section: monitoring_governance + contents: + - content_id: governance_plan + content_type: text + options: + default_text: > + Effective governance of the model is crucial to ensure its + reliability, security, and compliance with regulatory + requirements. + + Here is the plan for model governance: + + + 1. **Roles and Responsibilities:** Clear roles and + responsibilities will be assigned for model development, + validation, deployment, + + monitoring, and retraining. This will include a model owner, who + will have overall responsibility for the model, as well as others + + responsible for specific tasks. + + 2. **Model Documentation:** Comprehensive documentation will be + maintained for the model, including details of its development, + + validation, and performance, as well as any changes made to the + model or its inputs over time. This documentation will be updated + + regularly and will be accessible to all relevant stakeholders. + + 3. **Change Control:** Any changes to the model or its inputs will + be subject to a strict change control process, including + + documentation of the proposed change, review and approval by a + designated authority, testing and validation of the change, and a + + post-implementation review. + + 4. **Security and Access Control:** Measures will be put in place + to ensure the security of the model and its data, including + + access controls, data encryption, and regular security audits. + + 5. **Regulatory Compliance:** The model will be designed and + operated in compliance with all relevant regulatory requirements, + + and its compliance will be regularly reviewed and confirmed. + + 6. **Auditability:** The model and its operations will be + auditable, with clear and accessible records that can be reviewed + by + + internal or external auditors. + + 7. **Training and Awareness:** All individuals involved in the + development, operation, and oversight of the model will receive + + appropriate training and will be kept aware of their + responsibilities, the model's performance, and any relevant + developments or issues. diff --git a/site/notebooks/code_samples/nlp_and_llm/hugging_face_integration_demo.ipynb b/site/notebooks/use_cases/nlp_and_llm/hugging_face_integration_demo.ipynb similarity index 99% rename from site/notebooks/code_samples/nlp_and_llm/hugging_face_integration_demo.ipynb rename to site/notebooks/use_cases/nlp_and_llm/hugging_face_integration_demo.ipynb index dd7e03d056..c424355de3 100644 --- a/site/notebooks/code_samples/nlp_and_llm/hugging_face_integration_demo.ipynb +++ b/site/notebooks/use_cases/nlp_and_llm/hugging_face_integration_demo.ipynb @@ -132,8 +132,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Analytics`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/code_samples/nlp_and_llm/hugging_face_summarization_demo.ipynb b/site/notebooks/use_cases/nlp_and_llm/hugging_face_summarization_demo.ipynb similarity index 99% rename from site/notebooks/code_samples/nlp_and_llm/hugging_face_summarization_demo.ipynb rename to site/notebooks/use_cases/nlp_and_llm/hugging_face_summarization_demo.ipynb index 8209e65c2c..14816ce332 100644 --- a/site/notebooks/code_samples/nlp_and_llm/hugging_face_summarization_demo.ipynb +++ b/site/notebooks/use_cases/nlp_and_llm/hugging_face_summarization_demo.ipynb @@ -129,8 +129,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Analytics`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/code_samples/nlp_and_llm/llm_summarization_demo.ipynb b/site/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb similarity index 99% rename from site/notebooks/code_samples/nlp_and_llm/llm_summarization_demo.ipynb rename to site/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb index 159a16e79d..fe6e879568 100644 --- a/site/notebooks/code_samples/nlp_and_llm/llm_summarization_demo.ipynb +++ b/site/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb @@ -163,8 +163,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Sales/Prospecting`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb b/site/notebooks/use_cases/nlp_and_llm/prompt_validation_demo.ipynb similarity index 98% rename from site/notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb rename to site/notebooks/use_cases/nlp_and_llm/prompt_validation_demo.ipynb index 49633b3a99..ec70da33c8 100644 --- a/site/notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb +++ b/site/notebooks/use_cases/nlp_and_llm/prompt_validation_demo.ipynb @@ -94,7 +94,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. 
See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -159,8 +159,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Analytics`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/use_cases/nlp_and_llm/rag_benchmark_demo.ipynb b/site/notebooks/use_cases/nlp_and_llm/rag_benchmark_demo.ipynb new file mode 100644 index 0000000000..51515ad7bb --- /dev/null +++ b/site/notebooks/use_cases/nlp_and_llm/rag_benchmark_demo.ipynb @@ -0,0 +1,1869 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG Model Benchmarking Demo\n", + "\n", + "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. We'll demonstrate how to set up multiple models for benchmarking at each stage of the RAG pipeline - specifically two embedding models, two retrieval models with different parameters, and two LLM models (GPT-3.5 and GPT-4o) - allowing for comparison of performance across different configurations. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." 
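As a quick orientation before the step-by-step build, the end-to-end configurations benchmarked in this notebook pair each retrieval depth with each generator LLM. The sketch below is illustrative only (plain Python, not part of the notebook's pipeline code; the actual ValidMind model IDs are created later in the notebook):

```python
from itertools import product

# Illustrative preview of the end-to-end RAG configurations compared below:
# two retrieval depths (top-k) crossed with two generator LLMs.
retrieval_depths = [5, 10]
generator_llms = ["gpt-3.5-turbo", "gpt-4o"]

for k, llm in product(retrieval_depths, generator_llms):
    print(f"RAG pipeline: retriever top_k={k} -> generator {llm}")
```

Each of these four configurations is assembled and evaluated against the same test set further down.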
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Install the ValidMind Library](#toc2_1__) \n", + " - [Initialize the ValidMind Library](#toc2_2__) \n", + " - [Register sample model](#toc2_2_1__) \n", + " - [Apply documentation template](#toc2_2_2__) \n", + " - [Get your code snippet](#toc2_2_3__) \n", + "- [Read Open AI API Key](#toc3__) \n", + "- [Dataset Loader](#toc4__) \n", + "- [Data validation](#toc5__) \n", + " - [Duplicates](#toc5_1__) \n", + " - [Stop Words](#toc5_2__) \n", + " - [Punctuations](#toc5_3__) \n", + " - [Common Words](#toc5_4__) \n", + " - [Language Detection](#toc5_5__) \n", + " - [Toxicity Score](#toc5_6__) \n", + " - [Polarity and Subjectivity](#toc5_7__) \n", + " - [Sentiment](#toc5_8__) \n", + " - [Assign Predictions](#toc5_9__) \n", + " - [Run tests](#toc5_10__) \n", + " - [Generate embeddings for the Train Set](#toc5_11__) \n", + " - [Insert embeddings and questions into Vector DB](#toc5_12__) \n", + "- [Prompt Evaluation](#toc6__) \n", + "- [RAGAS evaluation](#toc7__) \n", + " - [Semantic Similarity](#toc7_1__) \n", + " - [Context Entity Recall](#toc7_2__) \n", + " - [Context Precision](#toc7_3__) \n", + " - [Context Precision Without Reference](#toc7_4__) \n", + " - [Faithfulness](#toc7_5__) \n", + " - [Response Relevancy](#toc7_6__) \n", + " - [Context Recall](#toc7_7__) \n", + " - [Answer Correctness](#toc7_8__) \n", + " - [Aspect Critic](#toc7_9__) \n", + " - [Noise Sensitivity](#toc7_10__) \n", + "- [Generation quality](#toc8__) \n", + " - [Token Disparity](#toc8_1__) \n", + " - [ROUGE Score](#toc8_2__) \n", + " - [BLEU Score](#toc8_3__) \n", + " - [BERT Score](#toc8_4__) \n", + " - [METEOR Score](#toc8_5__) \n", + "- [Bias and Toxicity](#toc9__) \n", + " - [Toxicity Score](#toc9_1__) \n", + " - [Regard Score](#toc9_2__) \n", + "- [Upgrade ValidMind](#toc10__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. 
For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", + "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", + "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisites\n", + "\n", + "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q \"validmind[llm]\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q qdrant-client langchain langchain-openai sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Gen AI RAG`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Read Open AI API Key\n", + "\n", + "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` and `text-embedding-3-large` models for our embeddings, `gpt-3.5-turbo` and `gpt-4o` models for our generator and `gpt-4o` model for our LLM-as-Judge tests. If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load openai api key\n", + "import os\n", + "\n", + "import dotenv\n", + "import nltk\n", + "\n", + "dotenv.load_dotenv()\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt_tab')\n", + "\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", + "\n", + "if not \"OPENAI_API_KEY\" in os.environ:\n", + " raise ValueError(\"OPENAI_API_KEY is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Dataset Loader\n", + "\n", + "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the sample dataset from the library\n", + "from validmind.datasets.llm.rag import rfp\n", + "\n", + "raw_df = rfp.load_data()\n", + "train_df, test_df = rfp.preprocess(raw_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds = vm.init_dataset(\n", + " train_df,\n", + " text_column=\"question\",\n", + " target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " test_df,\n", + " text_column=\"question\",\n", + " target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Data validation\n", + "\n", + "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in array of text data quality tests to check that things like number of duplicates, missing values, and other common text data issues are not present in our dataset. We can also run some tests to check the sentiment and toxicity of our data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Duplicates\n", + "\n", + "First, let's check for duplicates in our dataset. We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "\n", + "run_test(\n", + " test_id=\"validmind.data_validation.Duplicates\",\n", + " inputs={\"dataset\": vm_train_ds},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Stop Words\n", + "\n", + "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.StopWords` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.StopWords\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Punctuations\n", + "\n", + "Next, let's check for punctuations in our dataset. We can use the `validmind.data_validation.Punctuations` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.Punctuations\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Common Words\n", + "\n", + "Next, let's check for common words in our dataset. 
We can use the `validmind.data_validation.CommonWord` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.CommonWords\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Language Detection\n", + "\n", + "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.LanguageDetection` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Toxicity Score\n", + "\n", + "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.Toxicity\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Polarity and Subjectivity\n", + "\n", + "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Sentiment\n", + "\n", + "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.Sentiment\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embedding Model\n", + "\n", + "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding model. We will use `text-embedding-3-small` and `text-embedding-3-large` models from OpenAI for this purpose wrapped in the `OpenAIEmbeddings` class from LangChain. This model will be used to \"embed\" our questions both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the question from inputs when making predictions with our RAG model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embedding_small_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", + "\n", + "\n", + "def embed_small(input):\n", + " \"\"\"Returns a text embedding for the given text\"\"\"\n", + " return embedding_small_client.embed_query(input[\"question\"])\n", + "\n", + "\n", + "vm_embedder_small = vm.init_model(input_id=\"embedding_small_model\", predict_fn=embed_small)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_large_client = OpenAIEmbeddings(model=\"text-embedding-3-large\")\n", + "\n", + "\n", + "def embed_large(input):\n", + " \"\"\"Returns a text embedding for the given text\"\"\"\n", + " return embedding_large_client.embed_query(input[\"question\"])\n", + "\n", + "\n", + "vm_embedder_large = vm.init_model(input_id=\"embedding_large_model\", predict_fn=embed_large)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What we have done here is to initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` and `text-embedding-3-large` models. We then created an `embed` function that takes in an `input` dictionary and uses the `embed_query` method of the embedding client to compute the embeddings of the `question`. We use an `embed` function since that is how ValidMind supports any custom model. We will use this strategy for the retrieval and generator models as well but you could also use, say, a HuggingFace model directly. See the documentation for more information on which model types are directly supported - [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html)... Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation and any test that uses the model will be linked to the logged model and its metadata." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign Predictions\n", + "\n", + "To precompute the embeddings for our test set, we can call the `assign_predictions` method of our `vm_test_ds` object we created above. This will compute the embeddings for each question in the test set and store them in the a special prediction column of the test set thats linked to our `vm_embedder` model. This will allow us to use these embeddings later when we run tests against our embedding model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(vm_embedder_small)\n", + "vm_test_ds.assign_predictions(vm_embedder_large)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run tests\n", + "\n", + "Now that everything is setup for the embedding model, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"probability\": 0.3,\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"probability\": 0.3,\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"source_lang\": \"en\",\n", + " \"target_lang\": \"fr\",\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"n_components\": 3,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup Vector Store\n", + "\n", + "Great, so now that we have assessed our embedding model and verified that it is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Generate embeddings for the Train Set\n", + "\n", + "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds.assign_predictions(vm_embedder_small)\n", + "print(vm_train_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Insert embeddings and questions into Vector DB\n", + "\n", + "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.vectorstores import Qdrant\n", + "from langchain_community.document_loaders import DataFrameLoader\n", + "\n", + "# load documents from dataframe\n", + "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", + "docs = loader.load()\n", + "\n", + "# setup vector datastore\n", + "qdrant = Qdrant.from_documents(\n", + " docs,\n", + " embedding_small_client,\n", + " location=\":memory:\", # Local mode with in-memory storage only\n", + " collection_name=\"rfp_rag_collection\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retrieval Model\n", + "\n", + "Now that we have an embedding model and a vector database setup and loaded with our data, we need a Retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize this as a ValidMind model and `assign_predictions` to it just like our embedding model. In this example, we'll create two retrieval models with different `k` parameters (the number of documents retrieved) to benchmark and compare their performance. This approach allows us to evaluate how retrieval depth affects the overall system quality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve(input):\n", + " contexts = []\n", + "\n", + " for result in qdrant.similarity_search_with_score(input[\"question\"], k=5):\n", + " document, score = result\n", + " context = f\"Q: {document.page_content}\\n\"\n", + " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", + "\n", + " contexts.append(context)\n", + "\n", + " return contexts\n", + "\n", + "\n", + "vm_retriever_k5 = vm.init_model(input_id=\"retrieval_k5_model\", predict_fn=retrieve)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve(input):\n", + " contexts = []\n", + "\n", + " for result in qdrant.similarity_search_with_score(input[\"question\"], k=10):\n", + " document, score = result\n", + " context = f\"Q: {document.page_content}\\n\"\n", + " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", + "\n", + " contexts.append(context)\n", + "\n", + " return contexts\n", + "\n", + "\n", + "vm_retriever_k10 = vm.init_model(input_id=\"retrieval_k10_model\", predict_fn=retrieve)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_retriever_k5)\n", + "vm_test_ds.assign_predictions(model=vm_retriever_k10)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds._df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generation Model\n", + "\n", + "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` and `gpt-4o` models from OpenAI. Since we have two retrieval models (with different `k` values) and want to test two different LLMs, we'll create a total of four generator models - pairing each retrieval configuration with each LLM to comprehensively evaluate how both retrieval depth and model capability affect response quality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "\n", + "from validmind.models import Prompt\n", + "\n", + "\n", + "system_prompt = \"\"\"\n", + "You are an expert RFP AI assistant.\n", + "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", + "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", + "After that you will be provided with a new RFP question.\n", + "You will generate an answer and respond only with the answer.\n", + "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", + "\"\"\".strip()\n", + "\n", + "openai_client = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " \n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + " \n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k5_gpt35 = vm.init_model(\n", + " input_id=\"generation_k5_gpt35_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + "\n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k10_gpt35 = vm.init_model(\n", + " input_id=\"generation_k10_gpt35_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " \n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + " \n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k5_gpt4o = vm.init_model(\n", + " input_id=\"generation_k5_gpt4o_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + "\n", + " return 
response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k10_gpt4o = vm.init_model(\n", + " input_id=\"generation_k10_gpt4o_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's test it out real quick:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "vm_generator_k5_gpt35.predict(\n", + " pd.DataFrame(\n", + " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_generator_k5_gpt4o.predict(\n", + " pd.DataFrame(\n", + " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prompt Evaluation\n", + "\n", + "Now that we have our generator model initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **Examplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", + "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **Clarity**: How clearly the prompt states the task.\n", + "- **Conciseness**: How succinctly the prompt states the task.\n", + "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", + "- **Specificity**: How specific the prompt defines the task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Bias\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup RAG Pipeline Model\n", + "\n", + "Now that we have all of our individual \"component\" models setup and initialized we need some way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use a pipe(`|`) operator - in Python this is normally an `or` operator but we have overloaded it for easy pipeline creation - to chain together our models. We can then initialize this pipeline model and assign predictions to it just like any other model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_rag_k5_gpt35_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt35, input_id=\"rag_k5_gpt35_model\")\n", + "vm_rag_k10_gpt35_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt35, input_id=\"rag_k10_gpt35_model\")\n", + "vm_rag_k5_gpt4o_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt4o, input_id=\"rag_k5_gpt4o_model\")\n", + "vm_rag_k10_gpt4o_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt4o, input_id=\"rag_k10_gpt4o_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt35_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt35_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt4o_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt4o_model)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds._df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run tests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## RAGAS evaluation\n", + "\n", + "Let's go ahead and run some of our new RAG tests against our model...\n", + "\n", + "> Note: these tests are still being developed and are not yet in a stable state. We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area so stay tuned!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Semantic Similarity\n", + "\n", + "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", + "\n", + "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Entity Recall\n", + "\n", + "This test gives the measure of recall of the retrieved context, based on the number of entities present in both ground_truths and contexts relative to the number of entities present in the ground_truths alone. Simply put, it is a measure of what fraction of entities are recalled from ground_truths. This test is useful in fact-based use cases like tourism help desk, historical QA, etc. This test can help evaluate the retrieval mechanism for entities, based on comparison with entities present in ground_truths, because in cases where entities matter, we need the contexts which cover them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"reference_column\": [\"ground_truth\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Precision\n", + "\n", + "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This test is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecision\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Precision Without Reference\n", + "\n", + "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. This test assesses the relevance of each retrieved context chunk by comparing it directly to the response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid=[\n", + " {\"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_k5_model_prediction\",\n", + " \"response_column\": \"rag_k5_gpt4o_model_prediction\"\n", + " },\n", + " {\"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_k10_model_prediction\",\n", + " \"response_column\": \"rag_k10_gpt4o_model_prediction\"\n", + " },\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Faithfulness\n", + "\n", + "This measures the factual consistency of the generated answer against the given context. It is calculated from answer and retrieved context. The answer is scaled to (0,1) range. Higher the better.\n", + "\n", + "The generated answer is regarded as faithful if all the claims that are made in the answer can be inferred from the given context. 
To calculate this, a set of claims from the generated answer is first identified. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.Faithfulness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Response Relevancy\n", + "\n", + "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores indicate better relevancy. This test is computed using the question, the context, and the answer.\n", + "\n", + "Response Relevancy is defined as the mean cosine similarity of the original question to a number of artificial questions, which were generated (reverse engineered) based on the answer.\n", + "\n", + "Please note that even though in practice the score will range between 0 and 1 most of the time, this is not mathematically guaranteed, because cosine similarity ranges from -1 to 1.\n", + "\n", + "> Note: This is a reference-free test. If you're looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", + "\n", + "An answer is deemed relevant when it directly and appropriately addresses the original question. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Recall\n", + "\n", + "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", + "\n", + "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not.
In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Answer Correctness\n", + "\n", + "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", + "\n", + "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", + "\n", + "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", + "\n", + "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", + "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", + "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Aspect Critic\n", + "\n", + "This is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Additionally, users have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input.\n", + "\n", + "Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. Users can also define their own aspects for evaluating submissions based on their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AspectCritic\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Noise Sensitivity\n", + "\n", + "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Generation quality\n", + "\n", + "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Token Disparity\n", + "\n", + "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." 
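Before running the test, it can help to see what this metric is getting at. The sketch below computes token counts by hand with a naive whitespace split (a proxy; the actual test may use a different tokenizer) and assumes the prediction columns follow the `<input_id>_prediction` naming produced by the earlier `assign_predictions` calls in this notebook:

```python
# Rough, standalone illustration of token disparity (not the ValidMind implementation):
# compare token counts of each generated answer against the ground truth reference.
import pandas as pd

df = vm_test_ds.df  # the test dataset initialized earlier in this notebook

def token_count(text) -> int:
    """Naive whitespace tokenizer, used only as a proxy here."""
    return len(str(text).split())

summary = pd.DataFrame(
    {
        "ground_truth": df["ground_truth"].map(token_count),
        "gpt35_answer": df["rag_k5_gpt35_model_prediction"].map(token_count),
        "gpt4o_answer": df["rag_k5_gpt4o_model_prediction"].map(token_count),
    }
)

# Large gaps between the answer columns and ground_truth point to overly verbose
# or overly terse generations.
summary.describe()
```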
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.TokenDisparity\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### ROUGE Score\n", + "\n", + "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", + "\n", + "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RougeScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + " params={\n", + " \"metric\": \"rouge-1\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### BLEU Score\n", + "\n", + "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BleuScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### BERT Score\n", + "\n", + "This test evaluates the quality of the RAG generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. 
This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BertScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### METEOR Score\n", + "\n", + "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing at how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.MeteorScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Bias and Toxicity\n", + "\n", + "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth. These tests helps us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Toxicity Score\n", + "\n", + "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language." 
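The cell that follows runs the packaged test. If you want to spot-check individual strings outside of ValidMind, Hugging Face's `evaluate` package exposes a comparable toxicity measurement directly; a minimal sketch, assuming `evaluate` is installed and can download its default checkpoint (which may differ from the one the ValidMind test loads):

```python
# Standalone toxicity spot-check with Hugging Face's `evaluate` measurement.
# Illustrative only; the ValidMind test may use a different model or thresholds.
import evaluate

toxicity = evaluate.load("toxicity", module_type="measurement")

samples = [
    "Thank you for your question; our platform supports role-based access control.",
    "That is a terrible, stupid question.",
]

scores = toxicity.compute(predictions=samples)["toxicity"]
for text, score in zip(samples, scores):
    print(f"{score:.3f}  {text}")
```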
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ToxicityScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Regard Score\n", + "\n", + "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RegardScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate and document a simple RAG Model as its developed. This is a great example of the interactive development experience that ValidMind is designed to support. We can quickly iterate on our model and document as we go... We have seen how ValidMind supports non-traditional \"models\" using a functional interface and how we can build pipelines of many models to support complex GenAI workflows.\n", + "\n", + "This is still a work in progress and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-09e315440ca84258abe1aaefaca3a3d0", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/use_cases/nlp_and_llm/rag_documentation_demo.ipynb b/site/notebooks/use_cases/nlp_and_llm/rag_documentation_demo.ipynb new file mode 100644 index 0000000000..aac1876d5d --- /dev/null +++ b/site/notebooks/use_cases/nlp_and_llm/rag_documentation_demo.ipynb @@ -0,0 +1,1692 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG Model Documentation Demo\n", + "\n", + "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Initialize the ValidMind Library](#toc2_1__) \n", + " - [Register sample model](#toc2_1_1__) \n", + " - [Apply documentation template](#toc2_1_2__) \n", + " - [Get your code snippet](#toc2_1_3__) \n", + "- [Read Open AI API Key](#toc3__) \n", + "- [Dataset Loader](#toc4__) \n", + "- [Data validation](#toc5__) \n", + " - [Duplicates](#toc5_1__) \n", + " - [Stop Words](#toc5_2__) \n", + " - [Punctuations](#toc5_3__) \n", + " - [Common Words](#toc5_4__) \n", + " - [Language Detection](#toc5_5__) \n", + " - [Toxicity Score](#toc5_6__) \n", + " - [Polarity and Subjectivity](#toc5_7__) \n", + " - [Sentiment](#toc5_8__) \n", + " - [Assign Predictions](#toc5_9__) \n", + " - [Run tests](#toc5_10__) \n", + " - [Generate embeddings for the Train Set](#toc5_11__) \n", + " - [Insert embeddings and questions into Vector DB](#toc5_12__) \n", + "- [Prompt Evaluation](#toc6__) \n", + "- [RAGAS evaluation](#toc7__) \n", + " - [Semantic Similarity](#toc7_1__) \n", + " - [Context Entity Recall](#toc7_2__) \n", + " - [Context Precision](#toc7_3__) \n", + " - [Context Precision Without Reference](#toc7_4__) \n", + " - [Faithfulness](#toc7_5__) \n", + " - [Response Relevancy](#toc7_6__) \n", + " - [Context Recall](#toc7_7__) \n", + " - [Answer Correctness](#toc7_8__) \n", + " - [Aspect Critic](#toc7_9__) \n", + " - [Noise Sensitivity](#toc7_10__) \n", + "- [Generation quality](#toc8__) \n", + " - [Token Disparity](#toc8_1__) \n", + " - [ROUGE Score](#toc8_2__) \n", + " - [BLEU Score](#toc8_3__) \n", + " - [BERT Score](#toc8_4__) \n", + " - [METEOR Score](#toc8_5__) \n", + "- [Bias and Toxicity](#toc9__) \n", + " - [Toxicity Score](#toc9_1__) \n", + " - [Regard Score](#toc9_2__) \n", + "- [Upgrade ValidMind](#toc10__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a 
suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", + "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", + "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisites\n", + "\n", + "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q qdrant-client langchain langchain-openai sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Gen AI RAG`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Read Open AI API Key\n", + "\n", + "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` model for our embeddings, `gpt-3.5-turbo` model for our generator and `gpt-4o` model for our LLM-as-Judge tests. If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load openai api key\n", + "import os\n", + "\n", + "import dotenv\n", + "import nltk\n", + "\n", + "dotenv.load_dotenv()\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt_tab')\n", + "\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", + "\n", + "if not \"OPENAI_API_KEY\" in os.environ:\n", + " raise ValueError(\"OPENAI_API_KEY is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Dataset Loader\n", + "\n", + "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the sample dataset from the library\n", + "from validmind.datasets.llm.rag import rfp\n", + "\n", + "raw_df = rfp.load_data()\n", + "train_df, test_df = rfp.preprocess(raw_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds = vm.init_dataset(\n", + " train_df,\n", + " text_column=\"question\",\n", + " target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " test_df,\n", + " text_column=\"question\",\n", + " target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Data validation\n", + "\n", + "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in array of text data quality tests to check that things like number of duplicates, missing values, and other common text data issues are not present in our dataset. We can also run some tests to check the sentiment and toxicity of our data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Duplicates\n", + "\n", + "First, let's check for duplicates in our dataset. We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "\n", + "run_test(\n", + " test_id=\"validmind.data_validation.Duplicates\",\n", + " inputs={\"dataset\": vm_train_ds},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Stop Words\n", + "\n", + "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.StopWords` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.StopWords\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Punctuations\n", + "\n", + "Next, let's check for punctuations in our dataset. We can use the `validmind.data_validation.Punctuations` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.Punctuations\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Common Words\n", + "\n", + "Next, let's check for common words in our dataset. 
We can use the `validmind.data_validation.CommonWord` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.CommonWords\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Language Detection\n", + "\n", + "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.LanguageDetection` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Toxicity Score\n", + "\n", + "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.Toxicity\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Polarity and Subjectivity\n", + "\n", + "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Sentiment\n", + "\n", + "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.Sentiment\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embedding Model\n", + "\n", + "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding model. We will use the `text-embedding-3-small` model from OpenAI for this purpose wrapped in the `OpenAIEmbeddings` class from LangChain. This model will be used to \"embed\" our questions both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the question from inputs when making predictions with our RAG model." 
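Before wiring the embedder into ValidMind, it can be useful to see what an embedding actually is: a fixed-length vector of floats whose cosine similarity reflects semantic closeness. A quick standalone sketch (it calls the OpenAI API, so it assumes the `OPENAI_API_KEY` set above; the two example questions are made up):

```python
# Peek at raw embeddings and the cosine similarity between two related questions.
import numpy as np
from langchain_openai import OpenAIEmbeddings

emb = OpenAIEmbeddings(model="text-embedding-3-small")

v1 = np.array(emb.embed_query("Does your platform support single sign-on?"))
v2 = np.array(emb.embed_query("Can users authenticate via SSO?"))

print("embedding dimension:", v1.shape[0])
print("cosine similarity:", float(v1 @ v2 / (np.linalg.norm(v1) * np.linalg.norm(v2))))
```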
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embedding_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", + "\n", + "\n", + "def embed(input):\n", + " \"\"\"Returns a text embedding for the given text\"\"\"\n", + " return embedding_client.embed_query(input[\"question\"])\n", + "\n", + "\n", + "vm_embedder = vm.init_model(input_id=\"embedding_model\", predict_fn=embed)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What we have done here is to initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` model. We then created an `embed` function that takes in an `input` dictionary and uses the `embed_query` method of the embedding client to compute the embeddings of the `question`. We use an `embed` function since that is how ValidMind supports any custom model. We will use this strategy for the retrieval and generator models as well but you could also use, say, a HuggingFace model directly. See the documentation for more information on which model types are directly supported - [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html)... Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation and any test that uses the model will be linked to the logged model and its metadata." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign Predictions\n", + "\n", + "To precompute the embeddings for our test set, we can call the `assign_predictions` method of our `vm_test_ds` object we created above. This will compute the embeddings for each question in the test set and store them in the a special prediction column of the test set thats linked to our `vm_embedder` model. This will allow us to use these embeddings later when we run tests against our embedding model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(vm_embedder)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run tests\n", + "\n", + "Now that everything is setup for the embedding model, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our model." 
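If you want to see everything available under this namespace before picking a subset, the library ships a test-listing helper. The call below is a sketch: `list_tests` and its `filter` argument are assumed from recent library versions, so check the API reference for the exact signature in yours.

```python
# Browse the built-in embeddings tests before choosing which ones to run.
import validmind as vm

vm.tests.list_tests(filter="embeddings")
```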
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\"probability\": 0.3},\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\"probability\": 0.3},\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\n", + " \"source_lang\": \"en\",\n", + " \"target_lang\": \"fr\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.EuclideanDistanceHeatmap\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\"n_components\": 3},\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.TSNEComponentsPairwisePlots\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\"n_components\": 3, \"perplexity\": 20},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup Vector Store\n", + "\n", + "Great, so now that we have assessed our embedding model and verified that it is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." 
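A note on naming: the cells below use the `Qdrant` vector-store wrapper from `langchain_community`, which is backed by the `qdrant-client` package installed earlier. If the pattern is new to you, here is a throwaway, self-contained example of the same API on two made-up texts (it calls the OpenAI embeddings API; the collection name is arbitrary):

```python
# Toy in-memory vector store: index two texts, then retrieve the closest match.
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings

toy_store = Qdrant.from_texts(
    ["Our SLA guarantees 99.9% uptime.", "We support SAML-based single sign-on."],
    OpenAIEmbeddings(model="text-embedding-3-small"),
    location=":memory:",  # in-memory storage, nothing persisted
    collection_name="toy_collection",
)

# The most similar document should be the uptime statement.
for doc, score in toy_store.similarity_search_with_score("What is your uptime guarantee?", k=1):
    print(score, doc.page_content)
```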
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Generate embeddings for the Train Set\n", + "\n", + "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds.assign_predictions(vm_embedder)\n", + "print(vm_train_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Insert embeddings and questions into Vector DB\n", + "\n", + "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.vectorstores import Qdrant\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_community.document_loaders import DataFrameLoader\n", + "\n", + "# load documents from dataframe\n", + "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", + "docs = loader.load()\n", + "# choose model using embedding client\n", + "embedding_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", + "\n", + "# setup vector datastore\n", + "qdrant = Qdrant.from_documents(\n", + " docs,\n", + " embedding_client,\n", + " location=\":memory:\", # Local mode with in-memory storage only\n", + " collection_name=\"rfp_rag_collection\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retrieval Model\n", + "\n", + "Now that we have an embedding model and a vector database setup and loaded with our data, we need a Retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize this as a ValidMind model and `assign_predictions` to it just like our embedding model." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve(input):\n", + " contexts = []\n", + "\n", + " for result in qdrant.similarity_search_with_score(input[\"question\"]):\n", + " document, score = result\n", + " context = f\"Q: {document.page_content}\\n\"\n", + " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", + "\n", + " contexts.append(context)\n", + "\n", + " return contexts\n", + "\n", + "\n", + "vm_retriever = vm.init_model(input_id=\"retrieval_model\", predict_fn=retrieve)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_retriever)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generation Model\n", + "\n", + "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` model from OpenAI." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "\n", + "from validmind.models import Prompt\n", + "\n", + "\n", + "system_prompt = \"\"\"\n", + "You are an expert RFP AI assistant.\n", + "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", + "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", + "After that you will be provided with a new RFP question.\n", + "You will generate an answer and respond only with the answer.\n", + "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", + "\"\"\".strip()\n", + "\n", + "openai_client = OpenAI()\n", + "\n", + "\n", + "def generate(input):\n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + "\n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator = vm.init_model(\n", + " input_id=\"generation_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's test it out real quick:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "vm_generator.predict(\n", + " pd.DataFrame(\n", + " {\"retrieval_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prompt Evaluation\n", + "\n", + "Now that we have our generator model initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **Examplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", + "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **Clarity**: How clearly the prompt states the task.\n", + "- **Conciseness**: How succinctly the prompt states the task.\n", + "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", + "- **Specificity**: How specific the prompt defines the task." 
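The cells below run each check individually so the results are easy to review one at a time. Because they all take the same input, an equivalent, more compact form is to loop over the test IDs:

```python
# Run all six prompt-quality checks against the generator's system prompt in one loop.
prompt_tests = [
    "validmind.prompt_validation.Bias",
    "validmind.prompt_validation.Clarity",
    "validmind.prompt_validation.Conciseness",
    "validmind.prompt_validation.Delimitation",
    "validmind.prompt_validation.NegativeInstruction",
    "validmind.prompt_validation.Specificity",
]

for test_id in prompt_tests:
    run_test(test_id, inputs={"model": vm_generator}).log()
```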
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Bias\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup RAG Pipeline Model\n", + "\n", + "Now that we have all of our individual \"component\" models setup and initialized we need some way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use a pipe(`|`) operator - in Python this is normally an `or` operator but we have overloaded it for easy pipeline creation - to chain together our models. We can then initialize this pipeline model and assign predictions to it just like any other model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_rag_model = vm.init_model(vm_retriever | vm_generator, input_id=\"rag_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_rag_model)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds._df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run tests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## RAGAS evaluation\n", + "\n", + "Let's go ahead and run some of our new RAG tests against our model...\n", + "\n", + "> Note: these tests are still being developed and are not yet in a stable state. 
We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area so stay tuned!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Semantic Similarity\n", + "\n", + "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", + "\n", + "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Entity Recall\n", + "\n", + "This test gives the measure of recall of the retrieved context, based on the number of entities present in both ground_truths and contexts relative to the number of entities present in the ground_truths alone. Simply put, it is a measure of what fraction of entities are recalled from ground_truths. This test is useful in fact-based use cases like tourism help desk, historical QA, etc. This test can help evaluate the retrieval mechanism for entities, based on comparison with entities present in ground_truths, because in cases where entities matter, we need the contexts which cover them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"reference_column\": \"ground_truth\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Precision\n", + "\n", + "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This test is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." 
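For reference, the underlying Ragas metric is a rank-weighted average of precision at each retrieved position, where only positions holding a relevant chunk contribute. Paraphrasing the Ragas documentation (verify the exact formulation against your installed version):

$$
\text{Context Precision@}K = \frac{\sum_{k=1}^{K} \left(\text{Precision@}k \times v_k\right)}{\text{total relevant chunks in the top } K},
\qquad
\text{Precision@}k = \frac{\text{true positives@}k}{k},
$$

where $v_k \in \{0, 1\}$ indicates whether the chunk at rank $k$ is relevant.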
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecision\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Precision Without Reference\n", + "\n", + "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. It assesses the relevance of each retrieved context chunk by comparing it directly to the response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Faithfulness\n", + "\n", + "This test measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context, and the score is scaled to the (0, 1) range; higher is better.\n", + "\n", + "The generated answer is regarded as faithful if all the claims made in the answer can be inferred from the given context. To calculate this, a set of claims is first extracted from the generated answer. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.Faithfulness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Response Relevancy\n", + "\n", + "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, while higher scores indicate better relevancy. This test is computed using the question, the context, and the answer.\n", + "\n", + "Response Relevancy is defined as the mean cosine similarity between the original question and a number of artificial questions that were generated (reverse engineered) from the answer.\n", + "\n", + "Please note that even though in practice the score will fall between 0 and 1 most of the time, this is not mathematically guaranteed, because cosine similarity ranges from -1 to 1.\n", + "\n", + "> Note: This is a reference-free test. If you’re looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", + "\n", + "An answer is deemed relevant when it directly and appropriately addresses the original question. 
Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Recall\n", + "\n", + "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", + "\n", + "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Answer Correctness\n", + "\n", + "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", + "\n", + "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", + "\n", + "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", + "\n", + "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", + "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", + "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." 
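As a rough sketch of how these pieces can combine, the factual component can be expressed as an F1 score over the claim counts above and then blended with the semantic similarity score. The claim counts and the 0.75/0.25 weighting below are assumed example values, not necessarily the exact weights used by the ragas implementation.

```python
# Hypothetical sketch: blend a claim-level F1 (factual overlap) with semantic
# similarity to approximate an answer correctness score. The weighting and the
# example counts are assumptions for illustration only.
def factual_f1(tp: int, fp: int, fn: int) -> float:
    """Standard F1 written in terms of true/false positives and false negatives."""
    return tp / (tp + 0.5 * (fp + fn)) if tp else 0.0

def answer_correctness(tp, fp, fn, semantic_similarity, w_factual=0.75):
    """Weighted blend of factual overlap and semantic similarity."""
    return w_factual * factual_f1(tp, fp, fn) + (1 - w_factual) * semantic_similarity

# Example: 3 claims shared with the ground truth, 1 extra claim, 1 missing
# claim, and a semantic similarity of 0.9.
print(answer_correctness(tp=3, fp=1, fn=1, semantic_similarity=0.9))  # 0.7875
```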
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Aspect Critic\n", + "\n", + "This is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Additionally, users have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input.\n", + "\n", + "Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. Users can also define their own aspects for evaluating submissions based on their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AspectCritic\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Noise Sensitivity\n", + "\n", + "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Generation quality\n", + "\n", + "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. 
We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Token Disparity\n", + "\n", + "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.TokenDisparity\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### ROUGE Score\n", + "\n", + "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", + "\n", + "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RougeScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + " params={\n", + " \"metric\": \"rouge-1\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### BLEU Score\n", + "\n", + "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. 
While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BleuScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### BERT Score\n", + "\n", + "This test evaluates the quality of the RAG-generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BertScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### METEOR Score\n", + "\n", + "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing on how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.MeteorScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Bias and Toxicity\n", + "\n", + "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth.
These tests help us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality, enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Toxicity Score\n", + "\n", + "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ToxicityScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Regard Score\n", + "\n", + "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The test uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RegardScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate, and document a simple RAG model as it is developed. This is a great example of the interactive development experience that ValidMind is designed to support. We can quickly iterate on our model and document as we go. We have seen how ValidMind supports non-traditional \"models\" using a functional interface and how we can build pipelines of many models to support complex GenAI workflows.\n", + "\n", + "This is still a work in progress, and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-397fa35a68a34dc38f5d84d797fb5331", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-py3.10", + "language": "python", + "name": "validmind-py3.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/site/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb similarity index 99% rename from site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb rename to site/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index 1ff6ca3e05..ebdbaeae07 100644 --- a/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/site/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -108,7 +108,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -169,8 +169,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." 
diff --git a/site/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb b/site/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb similarity index 99% rename from site/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb rename to site/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb index 04f64a31ec..3cac66c032 100644 --- a/site/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb +++ b/site/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb @@ -106,7 +106,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", "\n", @@ -167,8 +167,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." @@ -525,7 +523,7 @@ "source": [ "Next, let's run *comparison tests*, which will allow comparing differences between the training dataset and monitoring datasets. To run a test in comparison mode, you only need to pass an `input_grid` parameter to the `run_test()` method instead of `inputs`.\n", "\n", - "For more information about comparison tests, see this [notebook](../../how_to/run_tests/2_run_comparison_tests.ipynb)." + "For more information about comparison tests, see this [notebook](../../how_to/tests/run_tests/2_run_comparison_tests.ipynb)." 
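For readers who want to see the shape of such a call, a minimal sketch of a comparison run using `input_grid` might look like the following. The test ID and the dataset `input_id` values are placeholders, not the exact inputs used in this notebook; substitute the ones initialized earlier.

```python
# Minimal sketch of a comparison test: input_grid maps each input name to a
# list of input_ids, and the test is run for each combination so results can
# be compared side by side. Test ID and dataset IDs below are placeholders.
from validmind.tests import run_test

result = run_test(
    "validmind.data_validation.DescriptiveStatistics",
    input_grid={"dataset": ["train_dataset", "monitoring_dataset"]},
)
result.log()
```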
] }, { diff --git a/site/notebooks/code_samples/ongoing_monitoring/xgboost_model.model b/site/notebooks/use_cases/ongoing_monitoring/xgboost_model.model similarity index 100% rename from site/notebooks/code_samples/ongoing_monitoring/xgboost_model.model rename to site/notebooks/use_cases/ongoing_monitoring/xgboost_model.model diff --git a/site/notebooks/code_samples/regression/quickstart_regression_full_suite.ipynb b/site/notebooks/use_cases/regression/quickstart_regression_full_suite.ipynb similarity index 99% rename from site/notebooks/code_samples/regression/quickstart_regression_full_suite.ipynb rename to site/notebooks/use_cases/regression/quickstart_regression_full_suite.ipynb index 90992448e1..91765950e0 100644 --- a/site/notebooks/code_samples/regression/quickstart_regression_full_suite.ipynb +++ b/site/notebooks/use_cases/regression/quickstart_regression_full_suite.ipynb @@ -137,8 +137,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/code_samples/time_series/quickstart_time_series_full_suite.ipynb b/site/notebooks/use_cases/time_series/quickstart_time_series_full_suite.ipynb similarity index 99% rename from site/notebooks/code_samples/time_series/quickstart_time_series_full_suite.ipynb rename to site/notebooks/use_cases/time_series/quickstart_time_series_full_suite.ipynb index 27e5727179..c154f66f6c 100644 --- a/site/notebooks/code_samples/time_series/quickstart_time_series_full_suite.ipynb +++ b/site/notebooks/use_cases/time_series/quickstart_time_series_full_suite.ipynb @@ -106,7 +106,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -171,8 +171,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. 
([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Attrition/Churn Management`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." diff --git a/site/notebooks/code_samples/time_series/quickstart_time_series_high_code.ipynb b/site/notebooks/use_cases/time_series/quickstart_time_series_high_code.ipynb similarity index 99% rename from site/notebooks/code_samples/time_series/quickstart_time_series_high_code.ipynb rename to site/notebooks/use_cases/time_series/quickstart_time_series_high_code.ipynb index 33a2e7220a..da5b9051b1 100644 --- a/site/notebooks/code_samples/time_series/quickstart_time_series_high_code.ipynb +++ b/site/notebooks/use_cases/time_series/quickstart_time_series_high_code.ipynb @@ -107,7 +107,7 @@ " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", "\n", "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", "\n", @@ -172,8 +172,6 @@ "\n", "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", "\n", - " For example, to register a model for use with this notebook, select the following use case: `Marketing/Sales - Analytics`\n", - "\n", "4. Select your own name under the **MODEL OWNER** drop-down.\n", "\n", "5. Click **Register Model** to add the model to your inventory." @@ -191,7 +189,7 @@ "\n", "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", "\n", - "2. Under **TEMPLATE**, select `Time Series Forecasting`.\n", + "2. Under **TEMPLATE**, select `Time Series Forecasting with ML`.\n", "\n", "3. Click **Use Template** to apply the template." 
] diff --git a/site/releases/2023/2023-nov-09/highlights.qmd b/site/releases/2023/2023-nov-09/highlights.qmd index 3c82b2c2fe..ae433a0d82 100644 --- a/site/releases/2023/2023-nov-09/highlights.qmd +++ b/site/releases/2023/2023-nov-09/highlights.qmd @@ -151,7 +151,7 @@ You can easily switch between views using the **Display Table** or **Display Car ::: {.w-30-ns} -[Test descriptions](/developer/model-testing/test-descriptions.qmd){.button} +[Test descriptions](/developer/test-descriptions.qmd){.button} ::: diff --git a/site/releases/2023/2023-oct-25/highlights.qmd b/site/releases/2023/2023-oct-25/highlights.qmd index d3ae1a598c..2103272349 100644 --- a/site/releases/2023/2023-oct-25/highlights.qmd +++ b/site/releases/2023/2023-oct-25/highlights.qmd @@ -202,7 +202,7 @@ We now programmatically embed our Jupyter Notebooks in our documentation site an ::: ::: {.w-30-ns} -[Run tests and test suites](/developer/model-testing/testing-overview.qmd){.button} +[Run tests and test suites](/developer/how-to/testing-overview.qmd){.button} ::: diff --git a/site/releases/2023/2023-sep-27/highlights.qmd b/site/releases/2023/2023-sep-27/highlights.qmd index 691aab12a5..6e1d0f074c 100644 --- a/site/releases/2023/2023-sep-27/highlights.qmd +++ b/site/releases/2023/2023-sep-27/highlights.qmd @@ -60,12 +60,12 @@ We added a new notebook in the {{< var vm.developer >}} that includes the financ :::: {.flex .flex-wrap .justify-around} ::: {.w-50-ns} -[Prompt validation for large language models (LLMs)](/notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb){.button .button-green} +[Prompt validation for large language models (LLMs)](/notebooks/use_cases/nlp_and_llm/prompt_validation_demo.ipynb){.button .button-green} ::: ::: {.w-50-ns} -[Automate news summarization using LLMs](/notebooks/code_samples/nlp_and_llm/llm_summarization_demo.ipynb){.button .button-green} +[Automate news summarization using LLMs](/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb){.button .button-green} ::: @@ -79,7 +79,7 @@ We added a new notebook in the {{< var vm.developer >}} that includes the financ To illustrate this new feature, we have included a **financial news sentiment analysis demo** that runs documentation tests for a Hugging Face model with text classification using the `financial_phrasebank`:[^2] ::: {.tc} -[Sentiment analysis of financial data using Hugging Face NLP models](/notebooks/code_samples/nlp_and_llm/hugging_face_integration_demo.ipynb){.button .button-green} +[Sentiment analysis of financial data using Hugging Face NLP models](/notebooks/use_cases/nlp_and_llm/hugging_face_integration_demo.ipynb){.button .button-green} ::: @@ -285,7 +285,7 @@ We made a number of changes to tests to improve the developer experience: ::: ::: {.w-20-ns} -[Test descriptions](/developer/model-testing/test-descriptions.qmd){.button} +[Test descriptions](/developer/test-descriptions.qmd){.button} ::: diff --git a/site/releases/2023/release-notes-2023-jun-22.qmd b/site/releases/2023/release-notes-2023-jun-22.qmd index 4afe9541a7..eeb5fa428a 100644 --- a/site/releases/2023/release-notes-2023-jun-22.qmd +++ b/site/releases/2023/release-notes-2023-jun-22.qmd @@ -25,7 +25,7 @@ With support for custom tests, you can now go beyond the default set of document ::: ::: {.w-30-ns} -[Implement custom tests](/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb){.button .button-green} +[Implement custom tests](/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb){.button .button-green} ::: @@ -42,7 +42,7 
@@ With test providers, you can now integrate external test libraries to expand the ::: ::: {.w-40-ns .tc} -[Integrate external test providers](/notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb){.button .button-green} +[Integrate external test providers](/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb){.button .button-green} ::: diff --git a/site/releases/2023/release-notes-2023-may-30.qmd b/site/releases/2023/release-notes-2023-may-30.qmd index 01c55a851d..31eecb08ec 100644 --- a/site/releases/2023/release-notes-2023-may-30.qmd +++ b/site/releases/2023/release-notes-2023-may-30.qmd @@ -34,7 +34,7 @@ Plots and visual outputs have been enhanced with the Plotly package. Users can n ::: ::: {.w-30-ns .tc} -[Test descriptions](/developer/model-testing/test-descriptions.qmd){.button .button-green} +[Test descriptions](/developer/test-descriptions.qmd){.button .button-green} ::: diff --git a/site/releases/2024/2024-aug-13/release-notes.qmd b/site/releases/2024/2024-aug-13/release-notes.qmd index 9b9b260bfb..5a1fa8d773 100644 --- a/site/releases/2024/2024-aug-13/release-notes.qmd +++ b/site/releases/2024/2024-aug-13/release-notes.qmd @@ -32,7 +32,7 @@ To make comparison tests easier to analyze, we've added support to specify an in ::: {.w-30-ns} ::: {.tc} -[Run comparison tests](/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb){.button .button-green} +[Run comparison tests](/notebooks/how_to/tests/run_tests/2_run_comparison_tests.ipynb){.button .button-green} ::: ::: diff --git a/site/releases/2024/2024-dec-24/release-notes.qmd b/site/releases/2024/2024-dec-24/release-notes.qmd index 7c9a1be8a2..ca025ecd0a 100644 --- a/site/releases/2024/2024-dec-24/release-notes.qmd +++ b/site/releases/2024/2024-dec-24/release-notes.qmd @@ -45,9 +45,9 @@ We've introduced two new quickstart Jupyter Notebooks dealing with option pricin ::: ::: {.w-40-ns .tc} -[Knockout option pricing model](/notebooks/code_samples/capital_markets/quickstart_option_pricing_models.ipynb){.button .button-green} +[Knockout option pricing model](/notebooks/use_cases/capital_markets/quickstart_option_pricing_models.ipynb){.button .button-green} -[Heston option pricing model](/notebooks/code_samples/capital_markets/quickstart_option_pricing_models_quantlib.ipynb){.button .button-green} +[Heston option pricing model](/notebooks/use_cases/capital_markets/quickstart_option_pricing_models_quantlib.ipynb){.button .button-green} ::: diff --git a/site/releases/2024/2024-feb-14/highlights.qmd b/site/releases/2024/2024-feb-14/highlights.qmd index 774544a832..e5f36cb898 100644 --- a/site/releases/2024/2024-feb-14/highlights.qmd +++ b/site/releases/2024/2024-feb-14/highlights.qmd @@ -24,7 +24,7 @@ Documentation templates have been updated to support logging each test run as a ::: ::: {.w-50-ns .tc} -[Document multiple results for the same test](/notebooks/how_to/document_multiple_results_for_the_same_test.ipynb){.button .button-green} +[Document multiple results for the same test](/notebooks/how_to/tests/run_tests/documentation_tests/document_multiple_results_for_the_same_test.ipynb){.button .button-green} ::: @@ -148,7 +148,7 @@ To enable model developers to know what task types and tags are available to fil ::: ::: {.w-30-ns .tc} -[Explore tests](/notebooks/how_to/explore_tests.ipynb){.button .button-green} +[Explore tests](/notebooks/how_to/tests/explore_tests/explore_tests.ipynb){.button .button-green} ::: diff --git a/site/releases/2024/2024-jan-26/highlights.qmd 
b/site/releases/2024/2024-jan-26/highlights.qmd index b2e7d0c0e1..7d4f2e3688 100644 --- a/site/releases/2024/2024-jan-26/highlights.qmd +++ b/site/releases/2024/2024-jan-26/highlights.qmd @@ -133,7 +133,7 @@ A new notebook illustrates how you can configure these dataset features: - How `feature_columns` can be used to report by segment ::: {.tc} -[Configure dataset features](/notebooks/how_to/configure_dataset_features.ipynb){.button.button-green} +[Configure dataset features](/notebooks/how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb){.button.button-green} ::: ::: @@ -209,7 +209,7 @@ full_suite = vm.run_documentation_tests( ``` ::: {.tc} -[Run individual documentation sections](/notebooks/how_to/run_documentation_sections.ipynb){.button .button-green} +[Run individual documentation sections](/notebooks/how_to/tests/run_tests/documentation_tests/run_documentation_sections.ipynb){.button .button-green} ::: ::: diff --git a/site/releases/2024/2024-jul-22/release-notes.qmd b/site/releases/2024/2024-jul-22/release-notes.qmd index b37adfacc8..c48cea4673 100644 --- a/site/releases/2024/2024-jul-22/release-notes.qmd +++ b/site/releases/2024/2024-jul-22/release-notes.qmd @@ -44,7 +44,7 @@ You can now run comparison tests with the {{< var validmind.developer >}}. ::: ::: {.w-30-ns} -[Run comparison tests](/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb){.button .button-green} +[Run comparison tests](/notebooks/how_to/tests/run_tests/2_run_comparison_tests.ipynb){.button .button-green} ::: @@ -294,7 +294,7 @@ We fixed a number of missing test descriptions that were caused by a scripting i ::: {.w-20-ns} -[Test descriptions](/developer/model-testing/test-descriptions.qmd){.button} +[Test descriptions](/developer/test-descriptions.qmd){.button} ::: diff --git a/site/releases/2024/2024-may-22/release-notes.qmd b/site/releases/2024/2024-may-22/release-notes.qmd index 2e8ff71423..15ad97639d 100644 --- a/site/releases/2024/2024-may-22/release-notes.qmd +++ b/site/releases/2024/2024-may-22/release-notes.qmd @@ -1097,5 +1097,5 @@ These features provide a rich context for testing and evaluation. You can use re -[^1]: See the full list of tests at [Test descriptions](/developer/model-testing/test-descriptions.qmd) or try the [Test sandbox](/developer/model-testing/test-sandbox.qmd). +[^1]: See the full list of tests at [Test descriptions](/developer/test-descriptions.qmd) or try the [Test sandbox](/developer/how-to/test-sandbox.qmd). diff --git a/site/releases/2025/2025-jan-31/release-notes.qmd b/site/releases/2025/2025-jan-31/release-notes.qmd index c27f371fac..91932ac527 100644 --- a/site/releases/2025/2025-jan-31/release-notes.qmd +++ b/site/releases/2025/2025-jan-31/release-notes.qmd @@ -9,7 +9,7 @@ listing: max-description-length: 250 # image-height: 100% contents: - - path: /notebooks/how_to/customize_test_result_descriptions.ipynb + - path: /notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb title: "Customize test result descriptions" description: "Learn how to take control over the context that drives test description generation. 
{{< fa chevron-right >}}" - id: credit-risk @@ -357,7 +357,7 @@ This enhancement makes it easier to distinguish between ValidMind's standard tes ::: ::: {.w-30-ns .tr} -[Test descriptions](/developer/model-testing/test-descriptions.qmd){.button target="_blank"} +[Test descriptions](/developer/test-descriptions.qmd){.button target="_blank"} ::: :::: diff --git a/site/scripts/developer-sidebar/update_how_tos.py b/site/scripts/developer-sidebar/update_how_tos.py new file mode 100644 index 0000000000..957e066160 --- /dev/null +++ b/site/scripts/developer-sidebar/update_how_tos.py @@ -0,0 +1,282 @@ +# Copyright © 2023-2026 ValidMind Inc. All rights reserved. +# Refer to the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial +""" +Update developer/_sidebar.yaml and developer/how-to/feature-overview.qmd +so that each subdirectory of notebooks/how_to/ (excluding tests/) is listed +explicitly in alphabetical order, with fixed capitalization where needed. + +Run from the site/ directory, e.g.: + cd site && python scripts/developer-sidebar/update_how_tos.py +""" + +import os +import re +from pathlib import Path + + +# Display title for directories that need fixed capitalization (e.g. acronyms) +SPECIAL_TITLES = { + "data_and_datasets": "Data and datasets", + "tests": "Testing", +} + +# Subdirectories to exclude from the sidebar only (tests has its own +# dedicated section in the sidebar already). +SIDEBAR_EXCLUDED_DIRS = {"tests"} + + +def dir_to_title(dirname: str) -> str: + """Convert directory name to display title (sentence-style capitalization).""" + if dirname in SPECIAL_TITLES: + return SPECIAL_TITLES[dirname] + return dirname.replace("_", " ").capitalize() + + +def dir_to_listing_id(dirname: str) -> str: + """Convert a directory name to a listing id (hyphens instead of underscores).""" + return dirname.replace("_", "-") + + +def _has_notebooks(directory: Path) -> bool: + """Check if a directory directly contains .ipynb files.""" + return any( + f.endswith(".ipynb") + for f in os.listdir(directory) + if (directory / f).is_file() + ) + + +def _has_notebooks_recursive(directory: Path) -> bool: + """Check if a directory or any of its descendants contains .ipynb files.""" + for _root, _dirs, files in os.walk(directory): + if any(f.endswith(".ipynb") for f in files): + return True + return False + + +def _build_section_yaml( + how_to_base: Path, rel_path: str, dirname: str, indent: int +) -> list[str]: + """Build YAML lines for a how-to section, nesting subdirectories. + + If the directory has subdirectories that contain notebooks, each one + becomes a nested ``section`` (recursively). Subdirectories without any + notebooks are ignored. Top-level notebooks within the directory are + included via a non-recursive glob (``*.ipynb``). Leaf directories use a + recursive glob (``**/*.ipynb``) for simplicity. + """ + full_path = how_to_base / rel_path + subdirs = sorted( + d + for d in os.listdir(full_path) + if (full_path / d).is_dir() and _has_notebooks_recursive(full_path / d) + ) + + prefix = " " * indent + title = dir_to_title(dirname) + lines = [f'{prefix}- section: "{title}"'] + + if not subdirs: + # Leaf directory — simple recursive glob. + lines.append(f'{prefix} contents: "notebooks/how_to/{rel_path}/**/*.ipynb"') + else: + # Has subdirectories — build an explicit contents list. + lines.append(f"{prefix} contents:") + + # List top-level notebooks explicitly (bare glob strings in a YAML + # list are not resolved by Quarto). 
+ for nb in sorted( + f for f in os.listdir(full_path) + if f.endswith(".ipynb") and (full_path / f).is_file() + ): + lines.append(f"{prefix} - notebooks/how_to/{rel_path}/{nb}") + + # Recurse into each subdirectory. + for subdir in subdirs: + sub_rel = f"{rel_path}/{subdir}" + lines.extend( + _build_section_yaml(how_to_base, sub_rel, subdir, indent + 4) + ) + + return lines + + +def update_sidebar(base: Path, subdirs: list) -> None: + """Update developer/_sidebar.yaml with how-to subdirectories.""" + sidebar_path = base / "developer" / "_sidebar.yaml" + + if not sidebar_path.is_file(): + raise SystemExit(f"Sidebar file not found: {sidebar_path}") + + how_to_base = base / "notebooks" / "how_to" + + # Build the new contents block (YAML). Use "section" so Quarto renders + # expandable accordion items; "text" alone does not expand. + lines = [" contents:"] + for d in subdirs: + lines.extend(_build_section_yaml(how_to_base, d, d, 12)) + + new_block = "\n".join(lines) + + text = sidebar_path.read_text() + + # Try to find and replace an existing expanded contents block under + # "Use library features" (re-run case). Match the header lines then + # consume all indented content lines (12+ spaces) that follow. + pattern = re.compile( + r'( - text: "Use library features"\n' + r" file: developer/how-to/feature-overview\.qmd\n)" + r" contents:\n" + r"(?:[ ]{12,}[^\n]*\n)*", + re.MULTILINE, + ) + match = pattern.search(text) + if match: + # Replace existing contents block, keeping the header lines. + text = ( + text[: match.start()] + + match.group(1) + + new_block + + "\n" + + text[match.end() :] + ) + else: + # First run: insert contents block right after the file: line. + insert_pattern = re.compile( + r'( - text: "Use library features"\n' + r" file: developer/how-to/feature-overview\.qmd\n)", + re.MULTILINE, + ) + insert_match = insert_pattern.search(text) + if not insert_match: + raise SystemExit( + 'Could not find "Use library features" entry in sidebar. ' + "Has the sidebar format changed?" 
+ ) + text = ( + text[: insert_match.end()] + + new_block + + "\n" + + text[insert_match.end() :] + ) + + sidebar_path.write_text(text) + print(f"Updated {sidebar_path} with {len(subdirs)} how-to directories.") + + +def _split_frontmatter(text: str): + """Split a .qmd file into (frontmatter_lines, body) at the closing ---.""" + lines = text.split("\n") + # First line must be --- + if not lines or lines[0].strip() != "---": + raise SystemExit("File does not start with YAML front matter (---).") + # Find the closing --- + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + frontmatter = lines[: i + 1] # includes both --- delimiters + body = "\n".join(lines[i + 1 :]) + return frontmatter, body + raise SystemExit("Could not find closing --- for YAML front matter.") + + +def update_feature_overview(base: Path, subdirs: list) -> None: + """Update feature-overview.qmd with listings and panel tabset + for each how-to subdirectory.""" + page_path = base / "developer" / "how-to" / "feature-overview.qmd" + + if not page_path.is_file(): + raise SystemExit(f"Feature overview page not found: {page_path}") + + text = page_path.read_text() + + # --- Build new listing YAML lines --- + listing_lines = ["listing:"] + for d in subdirs: + listing_id = dir_to_listing_id(d) + listing_lines.append(f" - id: {listing_id}") + listing_lines.append(f" type: grid") + listing_lines.append(f" grid-columns: 2") + listing_lines.append(f' image-placeholder: "jupyter-logo-rectangle.svg"') + listing_lines.append(f" max-description-length: 350") + listing_lines.append(f' image-height: "100%"') + listing_lines.append(f" fields: [title, description, reading-time]") + listing_lines.append(f' contents: "../../notebooks/how_to/{d}/**/*.ipynb"') + + # --- Update YAML front matter --- + frontmatter, body = _split_frontmatter(text) + + # Remove any existing listing: block from the front matter + fm_text = "\n".join(frontmatter) + fm_text = re.sub(r"\nlisting:\n(?:[ ].*\n)*", "\n", fm_text) + + # Re-split so we can insert the listing block before the closing --- + fm_lines = fm_text.split("\n") + # The last element should be --- + closing = fm_lines.pop() # remove closing --- + fm_lines.extend(listing_lines) + fm_lines.append(closing) + + # --- Build new panel-tabset block --- + tabset_lines = ["## How-to by topic", "", ":::{.panel-tabset}", ""] + for d in subdirs: + listing_id = dir_to_listing_id(d) + title = dir_to_title(d) + tabset_lines.append(f"## {title}") + tabset_lines.append("") + tabset_lines.append(f":::{{#{listing_id}}}") + tabset_lines.append(":::") + tabset_lines.append("") + tabset_lines.append(":::") + tabset_lines.append("") + + new_tabset = "\n".join(tabset_lines) + + # Replace everything from "## How-to by topic" to end of body. + feature_pattern = re.compile( + r"## How-to by topic\n?.*", re.DOTALL + ) + if feature_pattern.search(body): + body = feature_pattern.sub(new_tabset, body) + else: + # Heading not found; append the tabset at the end + body = body.rstrip("\n") + "\n\n" + new_tabset + + # --- Reassemble and write --- + result = "\n".join(fm_lines) + "\n" + body + page_path.write_text(result) + print(f"Updated {page_path} with {len(subdirs)} how-to listings.") + + +def main() -> None: + # Run from site/ or repo root + cwd = Path.cwd() + if (cwd / "notebooks" / "how_to").is_dir(): + base = cwd + elif (cwd / "site" / "notebooks" / "how_to").is_dir(): + base = cwd / "site" + else: + raise SystemExit( + "Run from site/ or repo root " + "(e.g. 
cd site && python scripts/developer-sidebar/update_how_tos.py)" + ) + how_to = base / "notebooks" / "how_to" + + all_subdirs = sorted( + d for d in os.listdir(how_to) if (how_to / d).is_dir() + ) + + # Sidebar excludes tests (it has its own dedicated section). + sidebar_subdirs = [d for d in all_subdirs if d not in SIDEBAR_EXCLUDED_DIRS] + + # Feature overview includes all dirs, with tests listed first. + overview_subdirs = [d for d in all_subdirs if d in SIDEBAR_EXCLUDED_DIRS] + [ + d for d in all_subdirs if d not in SIDEBAR_EXCLUDED_DIRS + ] + + update_sidebar(base, sidebar_subdirs) + update_feature_overview(base, overview_subdirs) + + +if __name__ == "__main__": + main() diff --git a/site/scripts/developer-sidebar/update_use_cases.py b/site/scripts/developer-sidebar/update_use_cases.py new file mode 100644 index 0000000000..0e8713dc8c --- /dev/null +++ b/site/scripts/developer-sidebar/update_use_cases.py @@ -0,0 +1,228 @@ +# Copyright © 2023-2026 ValidMind Inc. All rights reserved. +# Refer to the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial +""" +Update developer/_sidebar.yaml and developer/samples-jupyter-notebooks.qmd +so that each subdirectory of notebooks/use_cases/ is listed explicitly +in alphabetical order, with fixed capitalization for NLP and LLM. + +Run from the site/ directory, e.g.: + cd site && python scripts/developer-sidebar/update_use_cases.py +""" + +import os +import re +from pathlib import Path + + +# Display title for directories that need fixed capitalization (e.g. acronyms) +SPECIAL_TITLES = { + "nlp_and_llm": "NLP and LLM", +} + + +def dir_to_title(dirname: str) -> str: + """Convert directory name to sidebar display title (sentence-style capitalization).""" + if dirname in SPECIAL_TITLES: + return SPECIAL_TITLES[dirname] + return dirname.replace("_", " ").capitalize() + + +def dir_to_listing_id(dirname: str) -> str: + """Convert a directory name to a listing id (hyphens instead of underscores).""" + return dirname.replace("_", "-") + + +def _has_notebooks(directory: Path) -> bool: + """Check if a directory directly contains .ipynb files.""" + return any( + f.endswith(".ipynb") + for f in os.listdir(directory) + if (directory / f).is_file() + ) + + +def _has_notebooks_recursive(directory: Path) -> bool: + """Check if a directory or any of its descendants contains .ipynb files.""" + for _root, _dirs, files in os.walk(directory): + if any(f.endswith(".ipynb") for f in files): + return True + return False + + +def _build_section_yaml( + use_cases_base: Path, rel_path: str, dirname: str, indent: int +) -> list[str]: + """Build YAML lines for a use-case section, nesting subdirectories. + + If the directory has subdirectories that contain notebooks, each one + becomes a nested ``section`` (recursively). Subdirectories without any + notebooks are ignored. Top-level notebooks within the directory are + included via a non-recursive glob (``*.ipynb``). Leaf directories use a + recursive glob (``**/*.ipynb``) for simplicity. + """ + full_path = use_cases_base / rel_path + subdirs = sorted( + d + for d in os.listdir(full_path) + if (full_path / d).is_dir() and _has_notebooks_recursive(full_path / d) + ) + + prefix = " " * indent + title = dir_to_title(dirname) + lines = [f'{prefix}- section: "{title}"'] + + if not subdirs: + # Leaf directory — simple recursive glob. 
+ lines.append(f'{prefix} contents: "notebooks/use_cases/{rel_path}/**/*.ipynb"') + else: + # Has subdirectories — build an explicit contents list. + lines.append(f"{prefix} contents:") + + # List top-level notebooks explicitly (bare glob strings in a YAML + # list are not resolved by Quarto). + for nb in sorted( + f for f in os.listdir(full_path) + if f.endswith(".ipynb") and (full_path / f).is_file() + ): + lines.append(f"{prefix} - notebooks/use_cases/{rel_path}/{nb}") + + # Recurse into each subdirectory. + for subdir in subdirs: + sub_rel = f"{rel_path}/{subdir}" + lines.extend( + _build_section_yaml(use_cases_base, sub_rel, subdir, indent + 4) + ) + + return lines + + +def update_sidebar(base: Path, subdirs: list) -> None: + """Update developer/_sidebar.yaml with use case subdirectories.""" + sidebar_path = base / "developer" / "_sidebar.yaml" + + if not sidebar_path.is_file(): + raise SystemExit(f"Sidebar file not found: {sidebar_path}") + + use_cases_base = base / "notebooks" / "use_cases" + + # Build the new contents block (YAML). Use "section" so Quarto renders + # expandable accordion items; "text" alone does not expand. + lines = [" contents:"] + for d in subdirs: + lines.extend(_build_section_yaml(use_cases_base, d, d, 12)) + + new_block = "\n".join(lines) + + text = sidebar_path.read_text() + # Replace either the single wildcard or an existing expanded block. + # Accept both the old "code_samples" and the new "use_cases" paths so the + # script works on first migration as well as on subsequent re-runs. + old_single_use_cases = ' contents: "notebooks/use_cases/**"' + old_single_code_samples = ' contents: "notebooks/code_samples/**"' + if old_single_use_cases in text: + text = text.replace(old_single_use_cases, new_block, 1) + elif old_single_code_samples in text: + text = text.replace(old_single_code_samples, new_block, 1) + else: + # Find the contents block and replace it (multi-line). Match the + # header lines then consume all indented content lines (12+ spaces). + pattern = re.compile( + r'( - text: "(?:Code samples|Use cases)"\n' + r" file: developer/samples-jupyter-notebooks\.qmd\n)" + r" contents:\n" + r"(?:[ ]{12,}[^\n]*\n)*", + re.MULTILINE, + ) + match = pattern.search(text) + if not match: + raise SystemExit( + "Could not find Code samples / Use cases contents block in sidebar. " + "Has the sidebar format changed?" 
+ ) + text = text[: match.start()] + match.group(1) + new_block + "\n" + text[match.end() :] + sidebar_path.write_text(text) + print(f"Updated {sidebar_path} with {len(subdirs)} use case directories.") + + +def update_notebooks_page(base: Path, subdirs: list) -> None: + """Update samples-jupyter-notebooks.qmd with listings and panel tabset + for each use case subdirectory.""" + page_path = base / "developer" / "samples-jupyter-notebooks.qmd" + + if not page_path.is_file(): + raise SystemExit(f"Notebooks page not found: {page_path}") + + text = page_path.read_text() + + # --- Build new listing YAML block --- + listing_lines = ["listing:"] + for d in subdirs: + listing_id = dir_to_listing_id(d) + listing_lines.append(f" - id: {listing_id}") + listing_lines.append(f' type: grid') + listing_lines.append(f' grid-columns: 2') + listing_lines.append(f' image-placeholder: "jupyter-logo-rectangle.svg"') + listing_lines.append(f' max-description-length: 350') + listing_lines.append(f' image-height: "100%"') + listing_lines.append(f' fields: [title, description, reading-time]') + listing_lines.append(f' contents: "../notebooks/use_cases/{d}/*.ipynb"') + + new_listing_block = "\n".join(listing_lines) + "\n" + + # Replace the listing block in the YAML frontmatter. + # Match "listing:" at the start of a line and all following indented lines. + listing_pattern = re.compile( + r'^listing:\n(?:[ ].*\n)*', + re.MULTILINE, + ) + text = listing_pattern.sub(new_listing_block, text, count=1) + + # --- Build new panel-tabset block --- + tabset_lines = ["## By use case", "", ":::{.panel-tabset}", ""] + for d in subdirs: + listing_id = dir_to_listing_id(d) + title = dir_to_title(d) + tabset_lines.append(f"## {title}") + tabset_lines.append("") + tabset_lines.append(f":::{{#{listing_id}}}") + tabset_lines.append(":::") + tabset_lines.append("") + tabset_lines.append(":::") + tabset_lines.append("") + + new_tabset = "\n".join(tabset_lines) + + # Replace everything from "## By use case" to end of file. + use_case_pattern = re.compile(r'^## By use case\n.*', re.MULTILINE | re.DOTALL) + text = use_case_pattern.sub(new_tabset, text) + + page_path.write_text(text) + print(f"Updated {page_path} with {len(subdirs)} use case listings.") + + +def main() -> None: + # Run from site/ or repo root + cwd = Path.cwd() + if (cwd / "notebooks" / "use_cases").is_dir(): + base = cwd + elif (cwd / "site" / "notebooks" / "use_cases").is_dir(): + base = cwd / "site" + else: + raise SystemExit( + "Run from site/ or repo root " + "(e.g. cd site && python scripts/developer-sidebar/update_use_cases.py)" + ) + use_cases = base / "notebooks" / "use_cases" + + subdirs = sorted( + d for d in os.listdir(use_cases) + if (use_cases / d).is_dir() + ) + + update_sidebar(base, subdirs) + update_notebooks_page(base, subdirs) + + +if __name__ == "__main__": + main() diff --git a/site/scripts/update_code_samples_sidebar.py b/site/scripts/update_code_samples_sidebar.py deleted file mode 100644 index c9d31fa5d3..0000000000 --- a/site/scripts/update_code_samples_sidebar.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright © 2023-2026 ValidMind Inc. All rights reserved. -# Refer to the LICENSE file in the root of this repository for details. -# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial -""" -Update developer/_sidebar.yaml so the Code samples entry lists each -subdirectory of notebooks/code_samples/ explicitly (with wildcards), -in alphabetical order, with fixed capitalization for NLP and LLM. 
- -Run from the site/ directory, e.g.: - cd site && python scripts/update_code_samples_sidebar.py -""" - -import os -from pathlib import Path - - -# Display title for directories that need fixed capitalization (e.g. acronyms) -SPECIAL_TITLES = { - "nlp_and_llm": "NLP and LLM", -} - - -def dir_to_title(dirname: str) -> str: - """Convert directory name to sidebar display title (sentence-style capitalization).""" - if dirname in SPECIAL_TITLES: - return SPECIAL_TITLES[dirname] - return dirname.replace("_", " ").capitalize() - - -def main() -> None: - # Run from site/ or repo root - cwd = Path.cwd() - if (cwd / "notebooks" / "code_samples").is_dir(): - base = cwd - elif (cwd / "site" / "notebooks" / "code_samples").is_dir(): - base = cwd / "site" - else: - raise SystemExit("Run from site/ or repo root (e.g. cd site && python scripts/update_code_samples_sidebar.py)") - code_samples = base / "notebooks" / "code_samples" - sidebar_path = base / "developer" / "_sidebar.yaml" - - if not code_samples.is_dir(): - raise SystemExit(f"Directory not found: {code_samples}") - if not sidebar_path.is_file(): - raise SystemExit(f"Sidebar file not found: {sidebar_path}") - - subdirs = sorted( - d for d in os.listdir(code_samples) - if (code_samples / d).is_dir() - ) - - # Build the new contents block (YAML). Use "section" so Quarto renders - # expandable accordion items; "text" alone does not expand. - lines = [ - ' contents:', - ] - for d in subdirs: - title = dir_to_title(d) - lines.append(f' - section: "{title}"') - lines.append(f' contents: "notebooks/code_samples/{d}/**"') - - new_block = "\n".join(lines) - - text = sidebar_path.read_text() - # Replace either the single wildcard or an existing expanded block - old_single = ' contents: "notebooks/code_samples/**"' - if old_single in text: - text = text.replace(old_single, new_block, 1) - else: - # Find the code_samples contents block and replace it (multi-line) - import re - # Match from " contents:" through all following lines that are - # " - ..." or " contents: ..." for code_samples - pattern = re.compile( - r'( - text: "Code samples"\n' - r' file: developer/samples-jupyter-notebooks\.qmd\n)' - r' contents:\n' - r'( - (?:text|section): "[^"]+"\n' - r' contents: "notebooks/code_samples/[^"]+\*\*"\n)*', - re.MULTILINE, - ) - match = pattern.search(text) - if not match: - raise SystemExit( - "Could not find Code samples contents block in sidebar. " - "Has the sidebar format changed?" - ) - text = text[: match.start()] + match.group(1) + new_block + "\n" + text[match.end() :] - sidebar_path.write_text(text) - print(f"Updated {sidebar_path} with {len(subdirs)} code sample directories.") - - -if __name__ == "__main__": - main() diff --git a/site/training/common-slides/_validmind-test-repository.qmd b/site/training/common-slides/_validmind-test-repository.qmd index 17478fbc05..ab758f54db 100644 --- a/site/training/common-slides/_validmind-test-repository.qmd +++ b/site/training/common-slides/_validmind-test-repository.qmd @@ -2,7 +2,7 @@ Refer to the LICENSE file in the root of this repository for details. 
SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial --> -## {background-iframe="/developer/model-testing/test-descriptions.html" background-interactive="true" data-preload="yes"} +## {background-iframe="/developer/test-descriptions.html" background-interactive="true" data-preload="yes"} :::: {.slideover--l .three-quarters .auto-collapse-5} **{{< var vm.product >}} test repository** diff --git a/site/training/developer-fundamentals/finalizing-model-documentation.qmd b/site/training/developer-fundamentals/finalizing-model-documentation.qmd index 843d0eba13..0e53cbf143 100644 --- a/site/training/developer-fundamentals/finalizing-model-documentation.qmd +++ b/site/training/developer-fundamentals/finalizing-model-documentation.qmd @@ -200,7 +200,7 @@ For example: 2. Click on the text to edit the description for our individually inserted `HighPearsonCorrelation:balanced_raw_dataset` test: - ![Editor for a test result description](/notebooks/images/high-pearson-correlation-block.png){fig-alt="Screenshot showing the editor for a test result description" .screenshot} + ![Editor for a test result description](/notebooks/tutorials/model_development/high-pearson-correlation-block.png){fig-alt="Screenshot showing the editor for a test result description" .screenshot} ::: diff --git a/site/training/program/training-faq.qmd b/site/training/program/training-faq.qmd index 81e2951b20..85fdd0fa3c 100644 --- a/site/training/program/training-faq.qmd +++ b/site/training/program/training-faq.qmd @@ -209,7 +209,7 @@ Here's what we have been asked about {{< var vm.product >}} during training sess ::: ::: {.w-40-ns .nb3 .pl2} -[{{< fa graduation-cap >}} training](/training/developer-fundamentals/implementing-custom-tests.html#/implement-custom-tests){.button-small} [{{< fa book-open >}} docs](/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb){.button-small} +[{{< fa graduation-cap >}} training](/training/developer-fundamentals/implementing-custom-tests.html#/implement-custom-tests){.button-small} [{{< fa book-open >}} docs](/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb){.button-small} ::: :::: @@ -221,7 +221,7 @@ Here's what we have been asked about {{< var vm.product >}} during training sess ::: ::: {.w-40-ns .nb3 .pl2} -[{{< fa graduation-cap >}} training](/training/developer-fundamentals/implementing-custom-tests.html#/section-9){.button-small} [{{< fa book-open >}} docs](/notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb){.button-small} +[{{< fa graduation-cap >}} training](/training/developer-fundamentals/implementing-custom-tests.html#/section-9){.button-small} [{{< fa book-open >}} docs](/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb){.button-small} ::: :::: @@ -301,7 +301,7 @@ Here's what we have been asked about {{< var vm.product >}} during training sess ::: ::: {.w-40-ns .nb3 .pl2} -[{{< fa book-open >}} code sample](/notebooks/code_samples/nlp_and_llm/llm_summarization_demo.ipynb#initialize-the-python-environment){.button-small} +[{{< fa book-open >}} code sample](/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb#initialize-the-python-environment){.button-small} ::: :::: diff --git a/site/training/what-is-validmind/what-is-validmind.qmd b/site/training/what-is-validmind/what-is-validmind.qmd index a842611b6e..2e06a521bd 100644 --- a/site/training/what-is-validmind/what-is-validmind.qmd +++ b/site/training/what-is-validmind/what-is-validmind.qmd @@ -680,7 +680,7 @@ As a validator, use the same 
tools used to develop models to ... ::: ::: {.column width="30%" .tc .nt4} -[[{{< fa code >}} Validation Use Case]{.button .button-light-green .shadow-5-ns}](/notebooks/code_samples/model_validation/validate_application_scorecard.ipynb){target="_blank"}
+[[{{< fa code >}} Validation Use Case]{.button .button-light-green .shadow-5-ns}](/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb){target="_blank"}
::: ::::
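
Note for reviewers: the sketch below is a minimal, self-contained illustration of the replace-or-fallback splicing strategy that scripts/developer-sidebar/update_use_cases.py applies to developer/_sidebar.yaml. The toy sidebar string, the "Model validation" example directory, the "Stale entry" placeholder, and the exact indentation are assumptions made for this sketch only; the real script derives its replacement block from the directories under notebooks/use_cases/ and rewrites the sidebar file in place.

```python
# Toy illustration (assumed inputs) of the replace-or-regex-fallback approach
# used by scripts/developer-sidebar/update_use_cases.py. Not the real sidebar.
import re

# A pretend sidebar fragment that already contains an expanded contents block.
sidebar = (
    '        - text: "Use cases"\n'
    "          file: developer/samples-jupyter-notebooks.qmd\n"
    "          contents:\n"
    '            - section: "Stale entry"\n'
    '              contents: "notebooks/use_cases/stale/**"\n'
)

# The freshly built block that should replace whatever is there now.
new_block = (
    "          contents:\n"
    '            - section: "Model validation"\n'
    '              contents: "notebooks/use_cases/model_validation/**/*.ipynb"\n'
)

# Cheap path first: a single unexpanded wildcard entry, replaced verbatim.
old_single = '          contents: "notebooks/use_cases/**"\n'
if old_single in sidebar:
    sidebar = sidebar.replace(old_single, new_block, 1)
else:
    # Fallback: keep the '- text:' / 'file:' header (group 1), then consume
    # the 'contents:' line plus every line indented by 12 or more spaces.
    pattern = re.compile(
        r'(        - text: "(?:Code samples|Use cases)"\n'
        r"          file: developer/samples-jupyter-notebooks\.qmd\n)"
        r"          contents:\n"
        r"(?:[ ]{12,}[^\n]*\n)*"
    )
    match = pattern.search(sidebar)
    if match:
        sidebar = (
            sidebar[: match.start()] + match.group(1) + new_block + sidebar[match.end():]
        )

print(sidebar)
```

As the new script's comments note, the plain string replace handles the first migration (a single wildcard entry, whether the old code_samples path or the new use_cases path), while the regex fallback consumes a previously expanded block, so repeated runs of the Makefile target stay idempotent.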