diff --git a/site/_quarto.yml b/site/_quarto.yml index c06596a41c..315500556b 100644 --- a/site/_quarto.yml +++ b/site/_quarto.yml @@ -2,6 +2,7 @@ project: type: website metadata-files: + - developer/_sidebar.yaml - training/_sidebar.yaml - validmind/_sidebar.yml - installation/_sidebar.yaml @@ -38,7 +39,7 @@ website: file: get-started/get-started.qmd - text: "Guides" file: guide/guides.qmd - - text: "{{< fa cube >}} Developers" + - text: "{{< fa cube >}} {{< var validmind.developer >}}" menu: - text: "{{< fa laptop-code >}} {{< var validmind.developer >}}" file: developer/validmind-library.qmd @@ -64,15 +65,15 @@ website: file: https://jupyterhub.validmind.ai/ - text: "---" - text: "{{< fa book >}} REFERENCE" - - text: "{{< var validmind.developer >}} API" + - text: "{{< var validmind.api >}}" file: validmind/validmind.qmd - text: "Support" file: support/support.qmd - text: "Training" file: training/training.qmd - - text: "validmind.com {{< fa external-link >}}" - file: https://validmind.com/ - target: _blank + # - text: "validmind.com {{< fa external-link >}}" + # file: https://validmind.com/ + # target: _blank right: # HOME BUTTON FOR DEVELOPER & TRAINING SECTIONS - text: "{{< fa house >}}" @@ -89,17 +90,17 @@ website: - text: "{{< fa envelope-open-text >}} Support" file: support/support.qmd - text: "---" - - text: "{{< fa cube >}} Developers" + - text: "{{< fa cube >}} Python Library" - text: "{{< fa code >}} {{< var validmind.developer >}}" file: developer/validmind-library.qmd - text: "---" - text: "{{< fa graduation-cap >}} {{< var validmind.training >}}" - text: "{{< fa building-columns >}} Training Courses" file: training/training.qmd - - text: "---" - - text: "{{< fa square-check >}} validmind.com {{< fa external-link >}}" - file: https://validmind.com/ - target: _blank + # - text: "---" + # - text: "{{< fa square-check >}} validmind.com {{< fa external-link >}}" + # file: https://validmind.com/ + # target: _blank # TRAINING MENU FOR ACADEMY SECTION - text: "{{< fa graduation-cap >}} Training" menu: @@ -324,48 +325,6 @@ website: - guide/monitoring/review-monitoring-results.qmd - text: "Metrics over time" file: guide/monitoring/work-with-metrics-over-time.qmd - - - title: "{{< var validmind.developer >}}" - contents: - # USING THE VARIABLE IN THE LINK TEXT MESSES UP THE MOBILE VIEW - - text: "ValidMind Library" - file: developer/validmind-library.qmd - - developer/supported-models.qmd - - text: "---" - - text: "QuickStart" - - notebooks/quickstart_customer_churn_full_suite.ipynb - - text: "Install and initialize ValidMind Library" - file: developer/model-documentation/install-and-initialize-validmind-library.qmd - - developer/model-documentation/store-credentials-in-env-file.qmd - - text: "---" - - text: "Model Development" - # USING THE VARIABLE IN THE LINK TEXT MESSES UP THE MOBILE VIEW & BREADCRUMB - - text: "101 Set up ValidMind Library" - file: notebooks/tutorials/model_development/101-set_up_validmind.ipynb - - text: "102 Start model development process" - file: notebooks/tutorials/model_development/102-start_development_process.ipynb - - text: "103 Integrate custom tests" - file: notebooks/tutorials/model_development/103-integrate_custom_tests.ipynb - - text: "104 Finalize testing & documentation" - file: notebooks/tutorials/model_development/104-finalize_testing_documentation.ipynb - - text: "---" - - text: "Model Testing" - - text: "Run tests & test suites" - file: developer/model-testing/testing-overview.qmd - contents: "notebooks/how_to/**" - - text: "Test descriptions" 
- file: developer/model-testing/test-descriptions.qmd - contents: tests/** - - developer/model-testing/test-sandbox.qmd - - text: "---" - - text: "Notebooks" - - text: "Code samples" - file: developer/samples-jupyter-notebooks.qmd - contents: "notebooks/code_samples/**" - - text: "---" - - text: "Reference" - - text: "ValidMind Library API" - file: validmind/validmind.qmd - title: "Support" contents: diff --git a/site/_variables.yml b/site/_variables.yml index 39e952e001..5df50f021e 100644 --- a/site/_variables.yml +++ b/site/_variables.yml @@ -19,7 +19,7 @@ validmind: developer: "ValidMind Library" product: "ValidMind AI risk platform" vpv: "Virtual Private ValidMind" - api: "Python Library API" + api: "ValidMind Library Python API" legal: "ValidMind Inc." training: "ValidMind Academy" diff --git a/site/developer/_metadata.yml b/site/developer/_metadata.yml index 0be10fd2f4..cb9ee5eeea 100644 --- a/site/developer/_metadata.yml +++ b/site/developer/_metadata.yml @@ -1,4 +1,10 @@ format: html: + grid: + sidebar-width: 450px + margin-width: 450px page-layout: full - css: /developer/developer.css \ No newline at end of file + from: markdown-smart + css: + - /validmind/validmind.css + - /developer/developer.css \ No newline at end of file diff --git a/site/developer/_sidebar.yaml b/site/developer/_sidebar.yaml new file mode 100644 index 0000000000..58a7b0a24e --- /dev/null +++ b/site/developer/_sidebar.yaml @@ -0,0 +1,55 @@ +website: + sidebar: + - id: validmind-library + title: "ValidMind Library" + contents: + # USING THE VARIABLE IN THE LINK TEXT MESSES UP THE MOBILE VIEW + - text: "ValidMind Library" + file: developer/validmind-library.qmd + - developer/supported-models.qmd + - text: "---" + - text: "QuickStart" + - notebooks/quickstart_customer_churn_full_suite.ipynb + - text: "Install and initialize ValidMind Library" + file: developer/model-documentation/install-and-initialize-validmind-library.qmd + - developer/model-documentation/store-credentials-in-env-file.qmd + - text: "---" + - text: "Model Development" + # USING THE VARIABLE IN THE LINK TEXT MESSES UP THE MOBILE VIEW & BREADCRUMB + - text: "1 — Set up ValidMind Library" + file: notebooks/tutorials/model_development/1-set_up_validmind.ipynb + - text: "2 — Start model development process" + file: notebooks/tutorials/model_development/2-start_development_process.ipynb + - text: "3 — Integrate custom tests" + file: notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb + - text: "4 — Finalize testing & documentation" + file: notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb + - text: "---" + - text: "Model Validation" + # USING THE VARIABLE IN THE LINK TEXT MESSES UP THE MOBILE VIEW & BREADCRUMB + - text: "1 — Set up ValidMind Library for validation" + file: notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb + - text: "2 — Start model validation process" + file: notebooks/tutorials/model_validation/2-start_validation_process.ipynb + - text: "3 — Developing a challenger model" + file: notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb + - text: "4 — Finalize validation & reporting" + file: notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb + - text: "---" + - text: "Model Testing" + - text: "Run tests & test suites" + file: developer/model-testing/testing-overview.qmd + contents: "notebooks/how_to/**" + - text: "Test descriptions" + file: developer/model-testing/test-descriptions.qmd + contents: tests/** + - 
developer/model-testing/test-sandbox.qmd + - text: "---" + - text: "Notebooks" + - text: "Code samples" + file: developer/samples-jupyter-notebooks.qmd + contents: "notebooks/code_samples/**" + - text: "---" + - text: "Reference" + - text: "{{< var validmind.api >}}" + file: validmind/validmind.qmd diff --git a/site/developer/validmind-library.qmd b/site/developer/validmind-library.qmd index 8cc5b7e984..818d657eff 100644 --- a/site/developer/validmind-library.qmd +++ b/site/developer/validmind-library.qmd @@ -16,9 +16,9 @@ listing: grid-columns: 2 contents: - ../notebooks/quickstart_customer_churn_full_suite.ipynb - - path: https://www.youtube.com/watch?v=rIR8Mql7eGs + - path: https://youtu.be/rIR8Mql7eGs title: "{{< fa brands youtube >}} {{< var vm.product >}} QuickStart" - description: "Watch the walkthrough on YouTube: `https://www.youtube.com/watch?v=rIR8Mql7eGs`" + description: "Watch the walkthrough on YouTube: `https://youtu.be/rIR8Mql7eGs`" # - ../notebooks/tutorials/intro_for_model_developers.ipynb # - developer-getting-started-video.qmd - id: model-development @@ -26,18 +26,35 @@ listing: grid-columns: 2 max-description-length: 250 contents: - - path: ../notebooks/tutorials/model_development/101-set_up_validmind.ipynb - title: "101 Set up the {{< var validmind.developer >}}" - description: "Get to know {{< var vm.product >}} by setting up the {{< var validmind.developer >}} in your own environment and registering a sample binary classification model in the {{< var validmind.platform >}} for use with this series of notebooks." - - path: ../notebooks/tutorials/model_development/102-start_development_process.ipynb - title: "102 Start the model development process" + - path: ../notebooks/tutorials/model_development/1-set_up_validmind.ipynb + title: "1 — Set up the {{< var validmind.developer >}}" + description: "Get to know {{< var vm.product >}} by setting up the {{< var validmind.developer >}} in your own environment, and registering a sample binary classification model in the {{< var validmind.platform >}} for use with this series of notebooks." + - path: ../notebooks/tutorials/model_development/2-start_development_process.ipynb + title: "2 — Start the model development process" description: "Learn to run and log tests with a variety of methods and in different situations with the {{< var validmind.developer >}}, then add the results or evidence to your documentation for the sample model you registered." - - path: ../notebooks/tutorials/model_development/103-integrate_custom_tests.ipynb - title: "103 Integrate custom tests" + - path: ../notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb + title: "3 — Integrate custom tests" description: "After you become familiar with the basics of the {{< var validmind.developer >}}, learn how to supplement ValidMind tests with your own and include them as additional evidence in your documentation. " - - path: ../notebooks/tutorials/model_development/104-finalize_testing_documentation.ipynb - title: "104 Finalize testing and documentation" + - path: ../notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb + title: "4 — Finalize testing and documentation" description: "Wrap up by learning how to ensure that custom tests are included in your model's documentation template. By the end of this series, you will have a fully documented sample model ready for review." 
+ - id: model-validation + type: grid + grid-columns: 2 + max-description-length: 250 + contents: + - path: ../notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb + title: "1 — Set up the {{< var validmind.developer >}} for validation" + description: "Get to know {{< var vm.product >}} by setting up the {{< var validmind.developer >}} in your own environment, and gaining access as a validator to a sample model in the {{< var validmind.platform >}} for use with this series of notebooks." + - path: ../notebooks/tutorials/model_validation/2-start_validation_process.ipynb + title: "2 — Start the model validation process" + description: "Independently verify the data quality tests performed on datasets used to train the dummy champion model using tests from the {{< var validmind.developer >}}, then add the results or evidence to your validation report." + - path: ../notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb + title: "3 — Developing a potential challenger model" + description: "After you become familiar with the basics of the {{< var validmind.developer >}}, use it to develop a potential challenger model and run thorough model comparison tests, such as performance, diagnostic, and feature importance tests." + - path: ../notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb + title: "4 — Finalize validation and reporting" + description: "Wrap up by learning how to include custom tests and verifying that all tests conducted during model development were run and reported accurately. By the end of this series, you will have a validation report complete with findings ready for review." - id: run-tests grid-columns: 2 type: grid @@ -63,7 +80,7 @@ listing: max-description-length: 250 sort: false fields: [title, description] - contents: + contents: - ../guide/model-documentation/work-with-test-results.qmd - ../guide/model-documentation/work-with-content-blocks.qmd --- @@ -73,12 +90,12 @@ The {{< var validmind.developer >}} helps you streamline model documentation by ## What is the {{< var validmind.developer >}}? The {{< var validmind.developer >}} provides a rich collection of documentation tools and test suites, from documenting descriptions of your dataset to validation testing your models for weak spots and overfit areas. - + {{< var vm.product >}} offers two primary methods for automating model documentation: -- **Generate documentation** — Through automation, the {{< var vm.developer >}} extracts metadata from associated datasets and models for you and generates model documentation based on a template. You can also add more documentation and tests manually using the documentation editing capabilities in the {{< var validmind.platform >}}. +- **Generate documentation**[^1] — Through automation, the {{< var vm.developer >}} extracts metadata from associated datasets and models for you and generates model documentation based on a template. You can also add more documentation and tests manually using the documentation editing capabilities in the {{< var validmind.platform >}}. -- **Run validation tests** — The {{< var vm.developer >}} provides a suite of validation tests for common financial services use cases. For cases where these tests do not cover everything you need, you can also extend existing test suites with your own proprietary tests or testing providers. +- **Run validation tests**[^2] — The {{< var vm.developer >}} provides a suite of validation tests for common financial services use cases. 
For cases where these tests do not cover everything you need, you can also extend existing test suites with your own proprietary tests or testing providers. The {{< var validmind.developer >}} is designed to be model agnostic. If your model is built in Python, the {{< var vm.developer >}} provides all the standard functionality you may need without requiring you to rewrite any functions. @@ -98,6 +115,8 @@ After you [**sign up**](/guide/configuration/accessing-validmind.qmd) for {{< va :::{#library-quickstart} ::: + + ## {{< var vm.product >}} for model development Learn how to use ValidMind for your end-to-end model documentation process based on common model development scenarios with our *ValidMind for model development* series of four introductory notebooks: @@ -105,17 +124,26 @@ Learn how to use ValidMind for your end-to-end model documentation process based :::{#model-development} ::: + + +## {{< var vm.product >}} for model validation + +Learn how to use ValidMind for your end-to-end model validation process based on common scenarios with our *ValidMind for model validation* series of four introductory notebooks: + +:::{#model-validation} +::: + ## Learn how to run tests :::: {.flex .flex-wrap .justify-around} -::: {.w-80-ns} -The {{< var validmind.developer >}} provides many built-in tests and test suites which make it easy for developers to automate their model documentation. Start by running a pre-made test, then modify it, and finally create your own test: +::: {.w-70-ns} +The {{< var validmind.developer >}} provides many built-in tests and test suites which make it easy for developers to automate their model documentation. Start by running a pre-made test, then modify it, and finally create your own test: ::: -::: {.w-20-ns .tc} +::: {.w-30-ns .tc} [Run tests & test suites](model-testing/testing-overview.qmd){.button .button-green} ::: @@ -129,12 +157,12 @@ The {{< var validmind.developer >}} provides many built-in tests and test suites :::: {.flex .flex-wrap .justify-around} -::: {.w-80-ns} +::: {.w-70-ns} Our code samples showcase the capabilities of the {{< var validmind.developer >}}. 
Examples that you can build on and adapt for your own use cases include: ::: -::: {.w-20-ns .tc} +::: {.w-30-ns .tc} [All code samples](samples-jupyter-notebooks.qmd){.button .button-green} ::: @@ -148,12 +176,12 @@ Our code samples showcase the capabilities of the {{< var validmind.developer >} :::: {.flex .flex-wrap .justify-around} -::: {.w-70-ns} +::: {.w-60-ns} After you have tried out the {{< var validmind.developer >}}, continue working with your model documentation in the {{< var validmind.platform >}}: ::: -::: {.w-30-ns .tc} +::: {.w-40-ns .tc} [Working with model documentation](/guide/model-documentation/working-with-model-documentation.qmd){.button .button-green} ::: @@ -161,4 +189,11 @@ After you have tried out the {{< var validmind.developer >}}, continue working w :::: :::{#library-documentation} -::: \ No newline at end of file +::: + + + + +[^1]: [{{< var vm.product >}} for model development](#development) + +[^2]: [{{< var vm.product >}} for model validation](#validation) \ No newline at end of file diff --git a/site/notebooks.zip b/site/notebooks.zip index 3daf454cab..6190e6637c 100644 Binary files a/site/notebooks.zip and b/site/notebooks.zip differ diff --git a/site/notebooks/_metadata.yml b/site/notebooks/_metadata.yml index 0be10fd2f4..cb9ee5eeea 100644 --- a/site/notebooks/_metadata.yml +++ b/site/notebooks/_metadata.yml @@ -1,4 +1,10 @@ format: html: + grid: + sidebar-width: 450px + margin-width: 450px page-layout: full - css: /developer/developer.css \ No newline at end of file + from: markdown-smart + css: + - /validmind/validmind.css + - /developer/developer.css \ No newline at end of file diff --git a/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models.ipynb b/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models.ipynb index b019c7b457..6da5cffdef 100644 --- a/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models.ipynb +++ b/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models.ipynb @@ -82,7 +82,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models_quantlib.ipynb b/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models_quantlib.ipynb index e251918470..a8548826bb 100644 --- a/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models_quantlib.ipynb +++ b/site/notebooks/code_samples/capital_markets/quickstart_option_pricing_models_quantlib.ipynb @@ -120,7 +120,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_demo.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_demo.ipynb index 5bb5985f4e..1bc8aa931b 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_demo.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_demo.ipynb @@ -86,7 +86,7 @@ "\n", "\n", "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb index 349cfd30c6..8025ece67c 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb @@ -37,7 +37,7 @@ "\n", "\n", "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb index 2c91302c14..77e5b2ba07 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb @@ -37,7 +37,7 @@ "\n", "\n", "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_with_bias.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_with_bias.ipynb index 0d6f4e270e..0b909ddda8 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_with_bias.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_with_bias.ipynb @@ -75,7 +75,7 @@ "\n", "\n", "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n" + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n" ] }, { diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb index ca1bdb4e36..961a92c5c6 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb @@ -37,7 +37,7 @@ "\n", "\n", "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb b/site/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb index 34ed27e575..80393d5f31 100644 --- a/site/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb +++ b/site/notebooks/code_samples/custom_tests/implement_custom_tests.ipynb @@ -78,7 +78,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/model_validation/validate_application_scorecard.ipynb b/site/notebooks/code_samples/model_validation/validate_application_scorecard.ipynb new file mode 100644 index 0000000000..2ac83e3690 --- /dev/null +++ b/site/notebooks/code_samples/model_validation/validate_application_scorecard.ipynb @@ -0,0 +1,1831 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Validate an application scorecard model\n", + "\n", + "Learn how to independently assess an application scorecard model developed using the ValidMind Library as a validator. You'll evaluate the development of the model by conducting thorough testing and analysis, including the use of challenger models to benchmark performance.\n", + "\n", + "An *application scorecard model* is a type of statistical model used in credit scoring to evaluate the creditworthiness of potential borrowers by generating a score based on various characteristics of an applicant such as credit history, income, employment status, and other relevant financial data.\n", + "\n", + " - This score assists lenders in making informed decisions about whether to approve or reject loan applications, as well as in determining the terms of the loan, including interest rates and credit limits.\n", + " - Effective validation of application scorecard models ensures that lenders can manage risk efficiently while maintaining a fast and transparent loan application process for applicants.\n", + "\n", + "This interactive notebook provides a step-by-step guide for:\n", + "\n", + "- Verifying the data quality steps performed by the model development team\n", + "- Independently replicating the champion model's results and conducting additional tests to assess performance, stability, and robustness\n", + "- Setting up test inputs and challenger models for comparative analysis\n", + "- Running validation tests, analyzing results, and logging findings to ValidMind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1_) \n", + " - [Before you begin](#toc1_1_) \n", + " - [New to ValidMind?](#toc1_2_) \n", + " - [Key concepts](#toc1_3_) \n", + "- [Setting up](#toc2_) \n", + " - [Register a sample model](#toc2_1_) \n", + " - [Assign validator credentials](#toc2_1_1_) \n", + " - [Install the ValidMind Library](#toc2_2_) \n", + " - [Initialize the ValidMind Library](#toc2_3_) \n", + " - [Get your code snippet](#toc2_3_1_) \n", + " - [Importing the champion model](#toc2_4_) \n", + " - [Load the sample dataset](#toc2_5_) \n", + " - [Preprocess the dataset](#toc2_5_1_) \n", + " - [Apply feature engineering to the dataset](#toc2_5_2_) \n", + " - [Split the feature engineered dataset](#toc2_6_) \n", + "- [Developing potential challenger models](#toc3_) \n", + " - [Train potential challenger models](#toc3_1_) \n", + " - [Random forest classification model](#toc3_1_1_) \n", + " - [Logistic regression model](#toc3_1_2_) \n", + " - [Extract predicted probabilities](#toc3_2_) \n", + " - [Compute binary predictions](#toc3_2_1_) \n", + "- [Initializing the ValidMind objects](#toc4_) \n", + " - [Initialize the ValidMind datasets](#toc4_1_) \n", + " - [Initialize the model objects](#toc4_2_) \n", + " - [Assign predictions](#toc4_3_) \n", + " - [Compute credit risk scores](#toc4_4_) \n", + "- [Run data quality tests](#toc5_) \n", + " - [Run and log an individual data quality test](#toc5_1_) \n", + " - [Log multiple 
data quality tests](#toc5_2_) \n", + " - [Run data quality comparison tests](#toc5_3_) \n", + "- [Run performance tests](#toc6_) \n", + " - [Identify performance tests](#toc6_1_) \n", + " - [Run and log an individual performance test](#toc6_2_) \n", + " - [Log multiple performance tests](#toc6_3_) \n", + " - [Evaluate performance of the champion model](#toc6_4_) \n", + " - [Evaluate performance of challenger models](#toc6_5_) \n", + " - [Enable custom context for test descriptions](#toc6_5_1_) \n", + " - [Run performance comparison tests](#toc6_5_2_) \n", + "- [Adjust a ValidMind test](#toc7_) \n", + "- [Run diagnostic tests](#toc8_) \n", + "- [Run feature importance tests](#toc9_) \n", + "- [Implement a custom test](#toc10_) \n", + "- [Verify test runs](#toc11_) \n", + "- [Next steps](#toc12_) \n", + " - [Work with your validation report](#toc12_1_) \n", + " - [Discover more learning resources](#toc12_2_) \n", + "- [Upgrade ValidMind](#toc13_) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate comparison and other validation tests, and then use the ValidMind Platform to submit compliance assessments of champion models via comprehensive validation reports. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model developers." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "

\n", + "Signing up is FREE — Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Validation report**: A comprehensive and structured assessment of a model’s development and performance, focusing on verifying its integrity, appropriateness, and alignment with its intended use. It includes analyses of model assumptions, data quality, performance metrics, outcomes of testing procedures, and risk considerations. The validation report supports transparency, regulatory compliance, and informed decision-making by documenting the validator’s independent review and conclusions.\n", + "\n", + "**Validation report template**: Serves as a standardized framework for conducting and documenting model validation activities. It outlines the required sections, recommended analyses, and expected validation tests, ensuring consistency and completeness across validation reports. The template helps guide validators through a systematic review process while promoting comparability and traceability of validation outcomes.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets.\n", + "\n", + "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", + "\n", + "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Register a sample model\n", + "\n", + "In a usual model lifecycle, a champion model will have been independently registered in your model inventory and submitted to you for validation by your model development team as part of the effective challenge process. 
(**Learn more:** [Submit for approval](https://docs.validmind.ai/guide/model-documentation/submit-for-approval.html))\n", + "\n", + "For this notebook, we'll have you register a dummy model in the ValidMind Platform inventory and assign yourself as the validator to familiarize you with the ValidMind interface and circumvent the need for an existing model:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Credit Risk Scorecard`\n", + " - Use case: `Credit Risk — CECL`\n", + "\n", + " You can fill in other options according to your preference." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Assign validator credentials\n", + "\n", + "In order to log tests as a validator instead of as a developer, on the model details page that appears after you've successfully registered your sample model:\n", + "\n", + "1. Remove yourself as a developer: \n", + "\n", + " - Click on the **DEVELOPERS** tile.\n", + " - Click the **x** next to your name to remove yourself from that model's role.\n", + " - Click **Save** to apply your changes to that role.\n", + "\n", + "2. Add yourself as a validator: \n", + "\n", + " - Click on the **VALIDATORS** tile.\n", + " - Select your name from the drop-down menu.\n", + " - Click **Save** to apply your changes to that role." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your validation environment. You initialize the ValidMind Library with this code snippet, which ensures that your test results are uploaded to the correct model when you run the notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and select the model you registered for this notebook.\n", + "\n", + "3. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Importing the champion model\n", + "\n", + "With the ValidMind Library set up and ready to go, let's go ahead and import the champion model submitted by the model development team in the format of a `.pkl` file: **[xgb_model_champion.pkl](xgb_model_champion.pkl)**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "\n", + "#Load the saved model\n", + "xgb_model = xgb.XGBClassifier()\n", + "xgb_model.load_model(\"xgb_model_champion.pkl\")\n", + "xgb_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure that we have to appropriate order in feature names from Champion model and dataset\n", + "cols_when_model_builds = xgb_model.get_booster().feature_names" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Load the sample dataset\n", + "\n", + "Let's next import the public [Lending Club](https://www.kaggle.com/datasets/devanshi23/loan-data-2007-2014/data) dataset from Kaggle, which was used to develop the dummy champion model.\n", + "\n", + "- We'll use this dataset to review steps that should have been conducted during the initial development and documentation of the model to ensure that the model was built correctly.\n", + "- By independently performing steps such as preprocessing and feature engineering, we can confirm whether the model was built using appropriate and properly processed data.\n", + "\n", + "To be able to use the dataset, you'll need to import the dataset and load it into a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), a 
two-dimensional tabular data structure that makes use of rows and columns:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.datasets.credit_risk import lending_club\n", + "\n", + "df = lending_club.load_data(source=\"offline\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Preprocess the dataset\n", + "\n", + "We'll first quickly preprocess the dataset for data quality testing purposes using `lending_club.preprocess`. This function performs the following operations:\n", + "\n", + "- Filters the dataset to include only loans for debt consolidation or credit card purposes\n", + "- Removes loans classified under the riskier grades \"F\" and \"G\"\n", + "- Excludes uncommon home ownership types and standardizes employment length and loan terms into numerical formats\n", + "- Discards unnecessary fields and any entries with missing information to maintain a clean and robust dataset for modeling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocess_df = lending_club.preprocess(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply feature engineering to the dataset\n", + "\n", + "Feature engineering improves the dataset's structure to better match what our model expects, and ensures that the model performs optimally by leveraging additional insights from raw data.\n", + "\n", + "We'll apply the following transformations using the `ending_club.feature_engineering()` function to optimize the dataset for predictive modeling in our application scorecard:\n", + "\n", + "- **WoE encoding**: Converts both numerical and categorical features into Weight of Evidence (WoE) values. WoE is a statistical measure used in scorecard modeling that quantifies the relationship between a predictor variable and the binary target variable. It calculates the ratio of the distribution of good outcomes to the distribution of bad outcomes for each category or bin of a feature. This transformation helps to ensure that the features are predictive and consistent in their contribution to the model.\n", + "- **Integration of WoE bins**: Ensures that the WoE transformed values are integrated throughout the dataset, replacing the original feature values while excluding the target variable from this transformation. This transformation is used to maintain a consistent scale and impact of each variable within the model, which helps make the predictions more stable and accurate." 
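+ "\n",
+ "As a quick illustration of the WoE calculation itself (using hypothetical bin shares, not values from this dataset): a bin that holds 10% of all good outcomes but only 5% of all bad outcomes has a WoE of ln(0.10 / 0.05) ≈ 0.693:\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "\n",
+ "# Hypothetical share of good and bad outcomes that fall into one bin\n",
+ "pct_good = 0.10\n",
+ "pct_bad = 0.05\n",
+ "\n",
+ "# WoE is the natural log of the ratio of the two distributions\n",
+ "woe = np.log(pct_good / pct_bad)  # ≈ 0.693\n",
+ "```"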
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fe_df = lending_club.feature_engineering(preprocess_df)\n",
+ "fe_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Split the feature engineered dataset\n",
+ "\n",
+ "With our dummy model imported and our independently preprocessed and feature engineered dataset ready to go, let's now **split our dataset into train and test** to start the validation testing process.\n",
+ "\n",
+ "Splitting our dataset into training and testing sets is essential for proper validation testing, as this helps assess how well the model generalizes to unseen data:\n",
+ "\n",
+ "- We begin by dividing our data, which is based on Weight of Evidence (WoE) features, into training and testing sets (`train_df`, `test_df`).\n",
+ "- With `lending_club.split`, we employ a simple random split, randomly allocating data points to each set to ensure a mix of examples in both."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the data\n",
+ "train_df, test_df = lending_club.split(fe_df, test_size=0.2)\n",
+ "\n",
+ "x_train = train_df.drop(lending_club.target_column, axis=1)\n",
+ "y_train = train_df[lending_club.target_column]\n",
+ "\n",
+ "x_test = test_df.drop(lending_club.target_column, axis=1)\n",
+ "y_test = test_df[lending_club.target_column]\n",
+ "\n",
+ "# Now let's apply the order of features from the champion model construction\n",
+ "x_train = x_train[cols_when_model_builds]\n",
+ "x_test = x_test[cols_when_model_builds]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cols_use = ['annual_inc_woe',\n",
+ "    'verification_status_woe',\n",
+ "    'emp_length_woe',\n",
+ "    'installment_woe',\n",
+ "    'term_woe',\n",
+ "    'home_ownership_woe',\n",
+ "    'purpose_woe',\n",
+ "    'open_acc_woe',\n",
+ "    'total_acc_woe',\n",
+ "    'int_rate_woe',\n",
+ "    'sub_grade_woe',\n",
+ "    'grade_woe','loan_status']\n",
+ "\n",
+ "\n",
+ "train_df = train_df[cols_use]\n",
+ "test_df = test_df[cols_use]\n",
+ "test_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "## Developing potential challenger models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Train potential challenger models\n",
+ "\n",
+ "We're curious how alternate models compare to our champion model, so let's train two challenger models as a basis for our testing.\n",
+ "\n",
+ "Our selected options below offer decreased implementation complexity — such as less manual preprocessing — which can reduce implementation risk. However, model risk is not assessed on a single factor in isolation, but rather in consideration of trade-offs in predictive performance, ease of interpretability, and overall alignment with business objectives."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "#### Random forest classification model\n",
+ "\n",
+ "A *random forest classification model* is an ensemble machine learning algorithm that uses multiple decision trees to classify data. In ensemble learning, multiple models are combined to improve prediction accuracy and robustness.\n",
+ "\n",
+ "Random forest classification models generally have higher accuracy because they capture complex, non-linear relationships, but as a result they lack transparency in their predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import the Random Forest Classification model\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "\n",
+ "# Create the model instance with 50 decision trees\n",
+ "rf_model = RandomForestClassifier(\n",
+ "    n_estimators=50,\n",
+ "    random_state=42,\n",
+ ")\n",
+ "\n",
+ "# Train the model\n",
+ "rf_model.fit(x_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "#### Logistic regression model\n",
+ "\n",
+ "A *logistic regression model* is a statistical machine learning algorithm that uses a linear equation (straight-line relationship between variables) and the logistic function (or sigmoid function, which maps any real-valued number to a range between `0` and `1`) to classify data. In statistical modeling, a single equation is used to estimate the probability of an outcome based on input features.\n",
+ "\n",
+ "Logistic regression models are simple and interpretable because they provide clear probability estimates and feature coefficients (a numerical value that represents the influence of a particular input feature on the model's prediction), but they may struggle with capturing complex, non-linear relationships in the data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import the Logistic Regression model\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "# Logistic Regression grid params\n",
+ "log_reg_params = {\n",
+ "    \"penalty\": [\"l1\", \"l2\"],\n",
+ "    \"C\": [0.001, 0.01, 0.1, 1, 10, 100, 1000],\n",
+ "    \"solver\": [\"liblinear\"],\n",
+ "}\n",
+ "\n",
+ "# Grid search for Logistic Regression\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)\n",
+ "grid_log_reg.fit(x_train, y_train)\n",
+ "\n",
+ "# Logistic Regression best estimator\n",
+ "log_reg = grid_log_reg.best_estimator_\n",
+ "log_reg"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Extract predicted probabilities\n",
+ "\n",
+ "With our challenger models trained, let's extract the predicted probabilities from our three models:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Champion — Application scorecard model\n",
+ "train_xgb_prob = xgb_model.predict_proba(x_train)[:, 1]\n",
+ "test_xgb_prob = xgb_model.predict_proba(x_test)[:, 1]\n",
+ "\n",
+ "# Challenger — Random forest classification model\n",
+ "train_rf_prob = rf_model.predict_proba(x_train)[:, 1]\n",
+ "test_rf_prob = rf_model.predict_proba(x_test)[:, 1]\n",
+ "\n",
+ "# Challenger — Logistic regression model\n",
+ "train_log_prob = log_reg.predict_proba(x_train)[:, 1]\n",
+ "test_log_prob = log_reg.predict_proba(x_test)[:, 1]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "#### Compute binary predictions\n",
+ "\n",
+ "Next, we'll convert the probability predictions from our three models into binary predictions based on a threshold of `0.3`:\n",
+ "\n",
+ "- If the probability is greater than `0.3`, the prediction becomes `1` (positive).\n",
+ "- Otherwise, it becomes `0` (negative)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cut_off_threshold = 0.3\n",
+ "\n",
+ "# Champion — Application scorecard model\n",
+ "train_xgb_binary_predictions = (train_xgb_prob > cut_off_threshold).astype(int)\n",
+ "test_xgb_binary_predictions = (test_xgb_prob > cut_off_threshold).astype(int)\n",
+ "\n",
+ "# Challenger — Random forest classification model\n",
+ "train_rf_binary_predictions = (train_rf_prob > cut_off_threshold).astype(int)\n",
+ "test_rf_binary_predictions = (test_rf_prob > cut_off_threshold).astype(int)\n",
+ "\n",
+ "# Challenger — Logistic regression model\n",
+ "train_log_binary_predictions = (train_log_prob > cut_off_threshold).astype(int)\n",
+ "test_log_binary_predictions = (test_log_prob > cut_off_threshold).astype(int)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "## Initializing the ValidMind objects"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Initialize the ValidMind datasets\n",
+ "\n",
+ "Before you can run tests, you'll need to connect your data with a ValidMind `Dataset` object. **This step is necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n",
+ "\n",
+ "Initialize a ValidMind dataset object using the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset) from the ValidMind (`vm`) module. For this example, we'll pass in the following arguments:\n",
+ "\n",
+ "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n",
+ "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n",
+ "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the raw dataset\n",
+ "vm_raw_dataset = vm.init_dataset(\n",
+ "    dataset=df,\n",
+ "    input_id=\"raw_dataset\",\n",
+ "    target_column=lending_club.target_column,\n",
+ ")\n",
+ "\n",
+ "# Initialize the preprocessed dataset\n",
+ "vm_preprocess_dataset = vm.init_dataset(\n",
+ "    dataset=preprocess_df,\n",
+ "    input_id=\"preprocess_dataset\",\n",
+ "    target_column=lending_club.target_column,\n",
+ ")\n",
+ "\n",
+ "# Initialize the feature engineered dataset\n",
+ "vm_fe_dataset = vm.init_dataset(\n",
+ "    dataset=fe_df,\n",
+ "    input_id=\"fe_dataset\",\n",
+ "    target_column=lending_club.target_column,\n",
+ ")\n",
+ "\n",
+ "# Initialize the training dataset\n",
+ "vm_train_ds = vm.init_dataset(\n",
+ "    dataset=train_df,\n",
+ "    input_id=\"train_dataset\",\n",
+ "    target_column=lending_club.target_column,\n",
+ ")\n",
+ "\n",
+ "# Initialize the test dataset\n",
+ "vm_test_ds = vm.init_dataset(\n",
+ "    dataset=test_df,\n",
+ "    input_id=\"test_dataset\",\n",
+ "    target_column=lending_club.target_column,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After initialization, you can pass the ValidMind `Dataset` objects `vm_raw_dataset`, `vm_preprocess_dataset`, `vm_fe_dataset`, `vm_train_ds`, and `vm_test_ds` into any ValidMind tests.",
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Initialize the model objects\n",
+ "\n",
+ "You'll also need to initialize a ValidMind model object (`vm_model`) for each of our three models, so they can be passed to other functions for analysis and tests on the data.\n",
+ "\n",
+ "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the champion application scorecard model\n",
+ "vm_xgb_model = vm.init_model(\n",
+ "    xgb_model,\n",
+ "    input_id=\"xgb_model_developer_champion\",\n",
+ ")\n",
+ "\n",
+ "# Initialize the challenger random forest classification model\n",
+ "vm_rf_model = vm.init_model(\n",
+ "    rf_model,\n",
+ "    input_id=\"rf_model\",\n",
+ ")\n",
+ "\n",
+ "# Initialize the challenger logistic regression model\n",
+ "vm_log_model = vm.init_model(\n",
+ "    log_reg,\n",
+ "    input_id=\"log_model\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Assign predictions\n",
+ "\n",
+ "With our models registered, we'll move on to assigning both the predicted probabilities that come directly from each model, and the binary predictions obtained after applying the cutoff threshold described in the Compute binary predictions step above.\n",
+ "\n",
+ "- The [`assign_predictions()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset.assign_predictions) from the `Dataset` object can link existing predictions to any number of models.\n",
+ "- This method links the model's class prediction values and probabilities to our `vm_train_ds` and `vm_test_ds` datasets."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Champion — Application scorecard model\n", + "vm_train_ds.assign_predictions(\n", + " model=vm_xgb_model,\n", + " prediction_values=train_xgb_binary_predictions,\n", + " prediction_probabilities=train_xgb_prob,\n", + ")\n", + "\n", + "vm_test_ds.assign_predictions(\n", + " model=vm_xgb_model,\n", + " prediction_values=test_xgb_binary_predictions,\n", + " prediction_probabilities=test_xgb_prob,\n", + ")\n", + "\n", + "# Challenger — Random forest classification model\n", + "vm_train_ds.assign_predictions(\n", + " model=vm_rf_model,\n", + " prediction_values=train_rf_binary_predictions,\n", + " prediction_probabilities=train_rf_prob,\n", + ")\n", + "\n", + "vm_test_ds.assign_predictions(\n", + " model=vm_rf_model,\n", + " prediction_values=test_rf_binary_predictions,\n", + " prediction_probabilities=test_rf_prob,\n", + ")\n", + "\n", + "\n", + "# Challenger — Logistic regression model\n", + "vm_train_ds.assign_predictions(\n", + " model=vm_log_model,\n", + " prediction_values=train_log_binary_predictions,\n", + " prediction_probabilities=train_log_prob,\n", + ")\n", + "\n", + "vm_test_ds.assign_predictions(\n", + " model=vm_log_model,\n", + " prediction_values=test_log_binary_predictions,\n", + " prediction_probabilities=test_log_prob,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Compute credit risk scores\n", + "\n", + "Finally, we'll translate model predictions into actionable scores using probability estimates generated by our trained model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the scores\n", + "train_xgb_scores = lending_club.compute_scores(train_xgb_prob)\n", + "test_xgb_scores = lending_club.compute_scores(test_xgb_prob)\n", + "train_rf_scores = lending_club.compute_scores(train_rf_prob)\n", + "test_rf_scores = lending_club.compute_scores(test_rf_prob)\n", + "train_log_scores = lending_club.compute_scores(train_log_prob)\n", + "test_log_scores = lending_club.compute_scores(test_log_prob)\n", + "\n", + "# Assign scores to the datasets\n", + "vm_train_ds.add_extra_column(\"xgb_scores\", train_xgb_scores)\n", + "vm_test_ds.add_extra_column(\"xgb_scores\", test_xgb_scores)\n", + "vm_train_ds.add_extra_column(\"rf_scores\", train_rf_scores)\n", + "vm_test_ds.add_extra_column(\"rf_scores\", test_rf_scores)\n", + "vm_train_ds.add_extra_column(\"log_scores\", train_log_scores)\n", + "vm_test_ds.add_extra_column(\"log_scores\", test_log_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Run data quality tests\n", + "\n", + "With everything ready to go, let's explore some of ValidMind's available tests. Using ValidMind’s repository of tests streamlines your validation testing, and helps you ensure that your models are being validated appropriately.\n", + "\n", + "We want to narrow down the tests we want to run from the selection provided by ValidMind, so we'll use the [`vm.tests.list_tasks_and_tags()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tasks_and_tags) to list which `tags` are associated with each `task` type:\n", + "\n", + "- **`tasks`** represent the kind of modeling task associated with a test. 
Here we'll focus on `classification` tasks.\n", + "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `data_quality` tag." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tasks_and_tags()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we'll call [the `vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) to list all the data quality tests for classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(\n", + " tags=[\"data_quality\"], task=\"classification\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Want to learn more about navigating ValidMind tests?\n", + "

\n", + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run and log an individual data quality test\n", + "\n", + "Next, we'll use our previously initialized preprocessed dataset (`vm_preprocess_dataset`) as input to run an individual test, then log the result to the ValidMind Platform.\n", + "\n", + "- You run validation tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module.\n", + "- Every test result returned by the `run_test()` function has a [`.log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#TestResult.log) that can be used to send the test results to the ValidMind Platform.\n", + "\n", + "Here, we'll use the [`HighPearsonCorrelation` test](https://docs.validmind.ai/tests/data_validation/HighPearsonCorrelation.html) as an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.HighPearsonCorrelation\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Note the output indicating that a test-driven block doesn't currently exist in your model's documentation for some test IDs. \n", +

\n", + "That's expected, as when we run validations tests the results logged need to be manually added to your report as part of your compliance assessment process within the ValidMind Platform. You'll continue to see this message throughout this notebook as we run and log more tests.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Log multiple data quality tests\n", + "\n", + "Now that we understand how to run a test with ValidMind, we want to run all the tests that were returned for our `classification` tasks focusing on `data_quality`.\n", + "\n", + "We'll store the identified tests in `dq` in preparation for batch running these tests and logging their results to the ValidMind Platform:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dq = vm.tests.list_tests(tags=[\"data_quality\"], task=\"classification\",pretty=False)\n", + "dq" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "With our data quality tests stored, let's run our first batch of tests using the same preprocessed dataset (`vm_preprocess_dataset`) and log their results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for test in dq:\n", + " vm.tests.run_test(\n", + " test,\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run data quality comparison tests\n", + "\n", + "Next, let's reuse the tests in `dq` to perform comparison tests between the raw (`vm_raw_dataset`) and preprocessed (`vm_preprocess_dataset`) dataset, again logging the results to the ValidMind Platform:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for test in dq:\n", + " vm.tests.run_test(\n", + " test,\n", + " input_grid={\n", + " \"dataset\": [vm_raw_dataset,vm_preprocess_dataset]\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Run performance tests\n", + "\n", + "We'll also run some performance tests, beginning with independent testing of our champion application scorecard model, then moving on to our potential challenger models." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Identify performance tests\n", + "\n", + "Use `vm.tests.list_tests()` to this time identify all the model performance tests for classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "vm.tests.list_tests(tags=[\"model_performance\"], task=\"classification\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run and log an individual performance test\n", + "\n", + "Before we run our batch of performance tests, we'll use our previously initialized testing dataset (`vm_test_ds`) as input to run an individual test, then log the result to the ValidMind Platform.\n", + "\n", + "When running individual tests, you can use a custom `result_id` to tag the individual result with a unique identifier by appending this `result_id` to the `test_id` with a `:` separator. 
We'll append an identifier for our champion model here (`xgboost_champion`):\n", + "\n", + "Here, we'll use the [`ClassifierPerformance` test](https://docs.validmind.ai/tests/model_validation/sklearn/ClassifierPerformance.html) as an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " test_id=\"validmind.model_validation.sklearn.ClassifierPerformance:xgboost_champion\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds, \"model\" : vm_xgb_model\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Log multiple performance tests\n", + "\n", + "We only want to run a few other tests that were returned for our `classification` tasks focusing on `model_performance`, so we'll isolate the specific tests we want to batch run in `mpt`:\n", + "\n", + "- `ClassifierPerformance`\n", + "- [`ConfusionMatrix`](https://docs.validmind.ai/tests/model_validation/sklearn/ConfusionMatrix.html)\n", + "- [`MinimumAccuracy`](https://docs.validmind.ai/tests/model_validation/sklearn/MinimumAccuracy.html)\n", + "- [`MinimumF1Score`](https://docs.validmind.ai/tests/model_validation/sklearn/MinimumF1Score.html)\n", + "- [`ROCCurve`](https://docs.validmind.ai/tests/model_validation/sklearn/ROCCurve.html)\n", + "\n", + "Note the custom `result_id`s appended to the `test_id`s for our champion model (`xgboost_champion`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mpt = [\n", + " \"validmind.model_validation.sklearn.ClassifierPerformance:xgboost_champion\",\n", + " \"validmind.model_validation.sklearn.ConfusionMatrix:xgboost_champion\",\n", + " \"validmind.model_validation.sklearn.MinimumAccuracy:xgboost_champion\",\n", + " \"validmind.model_validation.sklearn.MinimumF1Score:xgboost_champion\",\n", + " \"validmind.model_validation.sklearn.ROCCurve:xgboost_champion\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Evaluate performance of the champion model\n", + "\n", + "Now, let's run and log our batch of model performance tests using our testing dataset (`vm_test_ds`) for our champion model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for test in mpt:\n", + " vm.tests.run_test(\n", + " test,\n", + " inputs={\n", + " \"dataset\": vm_test_ds, \"model\" : vm_xgb_model\n", + " },\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Evaluate performance of challenger models\n", + "\n", + "We've now conducted similar tests as the model development team for our champion model, with the aim of verifying their test results.\n", + "\n", + "Next, let's see how our challenger models compare. 
We'll use the same batch of tests here as we did in `mpt`, but append a different `result_id` to indicate that these results should be associated with our challenger models:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mpt_chall = [\n", + " \"validmind.model_validation.sklearn.ClassifierPerformance:xgboost_champion_vs_challengers\",\n", + " \"validmind.model_validation.sklearn.ConfusionMatrix:xgboost_champion_vs_challengers\",\n", + " \"validmind.model_validation.sklearn.MinimumAccuracy:xgboost_champion_vs_challengers\",\n", + " \"validmind.model_validation.sklearn.MinimumF1Score:xgboost_champion_vs_challengers\",\n", + " \"validmind.model_validation.sklearn.ROCCurve:xgboost_champion_vs_challengers\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Enable custom context for test descriptions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you run ValidMind tests, test descriptions are automatically generated with LLM using the test results, the test name, and the static test definitions provided in the test’s docstring. While this metadata offers valuable high-level overviews of tests, insights produced by the LLM-based descriptions may not always align with your specific use cases or incorporate organizational policy requirements.\n", + "\n", + "Before we run our next batch of tests, we'll include some custom use case context to focus on comparison testing going forward, improving the relevancy, insight, and format of the test descriptions returned. By default, custom context for LLM-generated descriptions is disabled, meaning that the output will not include any additional context. To enable custom use case context, set the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `1`.\n", + "\n", + "This is a global setting that will affect all tests for your linked model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED\"] = \"1\"" + ] + }, + { + "cell_type": "markdown", + "id": "0d1e90ba", + "metadata": {}, + "source": [ + "Enabling use case context allows you to pass in additional context to the LLM-generated text descriptions within `context`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED\"] = \"1\"\n", + "\n", + "context = \"\"\"\n", + "FORMAT FOR THE LLM DESCRIPTIONS: \n", + " **** is designed to .\n", + "\n", + " The test operates by \n", + "\n", + " The primary advantages of this test include \n", + "\n", + " Users should be aware that \n", + "\n", + " **Key Insights:**\n", + "\n", + " The test results reveal:\n", + "\n", + " - ****: \n", + " - ****: \n", + " ...\n", + "\n", + " Based on these results, \n", + "\n", + "ADDITIONAL INSTRUCTIONS:\n", + "\n", + " The champion model as the basis for comparison is called \"xgb_model_developer_champion\" and emphasis should be on the following:\n", + " - The metrics for the champion model compared against the challenger models\n", + " - Which model potentially outperforms the champion model based on the metrics, this should be highlighted and emphasized\n", + "\n", + "\n", + " For each metric in the test results, include in the test overview:\n", + " - The metric's purpose and what it 
measures\n", + " - Its mathematical formula\n", + " - The range of possible values\n", + " - What constitutes good/bad performance\n", + " - How to interpret different values\n", + "\n", + " Each insight should progressively cover:\n", + " 1. Overall scope and distribution\n", + " 2. Complete breakdown of all elements with specific values\n", + " 3. Natural groupings and patterns\n", + " 4. Comparative analysis between datasets/categories\n", + " 5. Stability and variations\n", + " 6. Notable relationships or dependencies\n", + "\n", + " Remember:\n", + " - Champion model (xgb_model_developer_champion) is the selection and challenger models are used to challenge the selection\n", + " - Keep all insights at the same level (no sub-bullets or nested structures)\n", + " - Make each insight complete and self-contained\n", + " - Include specific numerical values and ranges\n", + " - Cover all elements in the results comprehensively\n", + " - Maintain clear, concise language\n", + " - Use only \"- **Title**: Description\" format for insights\n", + " - Progress naturally from general to specific observations\n", + "\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Want to learn more about setting custom context for LLM-generated test descriptions?\n", + "

\n", + "Refer to our extended walkthrough notebook: Add context to LLM-generated test descriptions\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Run performance comparison tests\n", + "\n", + "With the use case context set, we'll run each test in `mpt_chall` once for each model with the same `vm_test_ds` dataset to compare them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for test in mpt_chall:\n", + " vm.tests.run_test(\n", + " test,\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds], \"model\" : [vm_xgb_model,vm_log_model,vm_rf_model]\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Based on the performance metrics, we can conclude that the random forest classification model is not a viable candidate for our use case and can be excluded from further testing.\n", +

\n", + "In the next section, we'll dive a bit deeper into some tests comparing our champion application scorecard model and our remaining challenger logistic regression model, including tests that will allow us to customize parameters and thresholds for performance standards.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Adjust a ValidMind test\n", + "\n", + "Let's dig deeper into the `MinimumF1Score` test we ran previously in Run performance tests to ensure that the models maintain a minimum acceptable balance between *precision* and *recall*. Precision refers to how many out of the positive predictions made by the model were actually correct, and recall refers to how many out of the actual positive cases did the model correctly identify.\n", + "\n", + "Use `run_test()` with our testing dataset (`vm_test_ds`) to run the test in isolation again for our two remaining models without logging the result to have the output to compare with a subsequent iteration:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.model_validation.sklearn.MinimumF1Score:xgboost_champion_vs_challengers\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_xgb_model, vm_log_model]\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As `MinimumF1Score` allows us to customize parameters and thresholds for performance standards, let's adjust the threshold to see if it improves metrics:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.tests.run_test(\n", + " \"validmind.model_validation.sklearn.MinimumF1Score:AdjThreshold\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_xgb_model, vm_log_model],\n", + " \"params\": {\"min_threshold\": 0.35}\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Run diagnostic tests\n", + "\n", + "Next we want to inspect the robustness and stability testing comparison between our champion and challenger model.\n", + "\n", + "Use `list_tests()` to identify all the model diagnosis tests for classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(tags=[\"model_diagnosis\"], task=\"classification\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see if models suffer from any *overfit* potentials and also where there are potential sub-segments of issues with the [`OverfitDiagnosis` test](https://docs.validmind.ai/tests/model_validation/sklearn/OverfitDiagnosis.html). \n", + "\n", + "Overfitting occurs when a model learns the training data too well, capturing not only the true pattern but noise and random fluctuations resulting in excellent performance on the training dataset but poor generalization to new, unseen data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " test_id=\"validmind.model_validation.sklearn.OverfitDiagnosis:Champion_vs_LogRegression\",\n", + " input_grid={\n", + " \"datasets\": [[vm_train_ds,vm_test_ds]],\n", + " \"model\" : [vm_xgb_model,vm_log_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's also conduct *robustness* and *stability* testing of the two models with the [`RobustnessDiagnosis` test](https://docs.validmind.ai/tests/model_validation/sklearn/RobustnessDiagnosis.html).\n", + "\n", + "Robustness refers to a model's ability to maintain consistent performance, and stability refers to a model's ability to produce consistent outputs over time across different data subsets.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " test_id=\"validmind.model_validation.sklearn.RobustnessDiagnosis:Champion_vs_LogRegression\",\n", + " input_grid={\n", + " \"datasets\": [[vm_train_ds,vm_test_ds]],\n", + " \"model\" : [vm_xgb_model,vm_log_model]\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Run feature importance tests\n", + "\n", + "We also want to verify the relative influence of different input features on our models' predictions, as well as inspect the differences between our champion and challenger model to see if a certain model offers more understandable or logical importance scores for features.\n", + "\n", + "Use `list_tests()` to identify all the feature importance tests for classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Store the feature importance tests\n", + "FI = vm.tests.list_tests(tags=[\"feature_importance\"], task=\"classification\",pretty=False)\n", + "FI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run and log our feature importance tests for both models for the testing dataset\n", + "for test in FI:\n", + " vm.tests.run_test(\n", + " \"\".join((test,':Champion_vs_LogisticRegression')),\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds], \"model\" : [vm_xgb_model,vm_log_model]\n", + " },\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Implement a custom test\n", + "\n", + "Let's finish up testing by implementing a custom *inline test* that outputs a FICO score-type score. 
An inline test refers to a test written and executed within the same environment as the code being tested — in this case, right in this Jupyter Notebook — without requiring a separate test file or framework.\n", + "\n", + "The [`@vm.test` wrapper](https://docs.validmind.ai/validmind/validmind.html#test) allows you to create a reusable test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "@vm.test(\"my_custom_tests.ScoreToOdds\")\n", + "def score_to_odds_analysis(dataset, score_column='score', score_bands=[410, 440, 470]):\n", + " \"\"\"\n", + " Analyzes the relationship between score bands and odds (good:bad ratio).\n", + " Good odds = (1 - default_rate) / default_rate\n", + " \n", + " Higher scores should correspond to higher odds of being good.\n", + "\n", + " If there are multiple scores provided through score_column, this means that there are two different models and the scores reflect each model\n", + "\n", + " If there are more scores provided in the score_column then focus the assessment on the differences between the two scores and indicate through evidence which one is preferred.\n", + " \"\"\"\n", + " df = dataset.df\n", + " \n", + " # Create score bands\n", + " df['score_band'] = pd.cut(\n", + " df[score_column],\n", + " bins=[-np.inf] + score_bands + [np.inf],\n", + " labels=[f'<{score_bands[0]}'] + \n", + " [f'{score_bands[i]}-{score_bands[i+1]}' for i in range(len(score_bands)-1)] +\n", + " [f'>{score_bands[-1]}']\n", + " )\n", + " \n", + " # Calculate metrics per band\n", + " results = df.groupby('score_band').agg({\n", + " dataset.target_column: ['mean', 'count']\n", + " })\n", + " \n", + " results.columns = ['Default Rate', 'Total']\n", + " results['Good Count'] = results['Total'] - (results['Default Rate'] * results['Total'])\n", + " results['Bad Count'] = results['Default Rate'] * results['Total']\n", + " results['Odds'] = results['Good Count'] / results['Bad Count']\n", + " \n", + " # Create visualization\n", + " fig = go.Figure()\n", + " \n", + " # Add odds bars\n", + " fig.add_trace(go.Bar(\n", + " name='Odds (Good:Bad)',\n", + " x=results.index,\n", + " y=results['Odds'],\n", + " marker_color='blue'\n", + " ))\n", + " \n", + " fig.update_layout(\n", + " title='Score-to-Odds Analysis',\n", + " yaxis=dict(title='Odds Ratio (Good:Bad)'),\n", + " showlegend=False\n", + " )\n", + " \n", + " return fig" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the custom test available, run and log the test for our champion and challenger models with our testing dataset (`vm_test_ds`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.tests.run_test(\n", + " \"my_custom_tests.ScoreToOdds:Champion_vs_Challenger\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " param_grid={\n", + " \"score_column\": [\"xgb_scores\",\"log_scores\"],\n", + " \"score_bands\": [[500, 540, 570]],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Want to learn more about custom tests?\n", + "

\n", + "Refer to our in-depth introduction to custom tests: Implement custom tests
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Verify test runs\n", + "\n", + "Our final task is to verify that all the tests provided by the model development team were run and reported accurately. Note the appended `result_ids` to delineate which dataset we ran the test with for the relevant tests.\n", + "\n", + "Here, we'll specify all the tests we'd like to independently rerun in a dictionary called `test_config`. **Note here that `inputs` and `input_grid` expect the `input_id` of the dataset or model as the value rather than the variable name we specified**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_config = {\n", + " # Run with the raw dataset\n", + " 'validmind.data_validation.DatasetDescription:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'}\n", + " },\n", + " 'validmind.data_validation.DescriptiveStatistics:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'}\n", + " },\n", + " 'validmind.data_validation.MissingValues:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'min_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.ClassImbalance:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'min_percent_threshold': 10}\n", + " },\n", + " 'validmind.data_validation.Duplicates:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'min_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.HighCardinality:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {\n", + " 'num_threshold': 100,\n", + " 'percent_threshold': 0.1,\n", + " 'threshold_type': 'percent'\n", + " }\n", + " },\n", + " 'validmind.data_validation.Skewness:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'max_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.UniqueRows:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'min_percent_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.TooManyZeroValues:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'max_percent_threshold': 0.03}\n", + " },\n", + " 'validmind.data_validation.IQROutliersTable:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'threshold': 5}\n", + " },\n", + " # Run with the preprocessed dataset\n", + " 'validmind.data_validation.DescriptiveStatistics:preprocessed_data': {\n", + " 'inputs': {'dataset': 'preprocess_dataset'}\n", + " },\n", + " 'validmind.data_validation.TabularDescriptionTables:preprocessed_data': {\n", + " 'inputs': {'dataset': 'preprocess_dataset'}\n", + " },\n", + " 'validmind.data_validation.MissingValues:preprocessed_data': {\n", + " 'inputs': {'dataset': 'preprocess_dataset'},\n", + " 'params': {'min_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.TabularNumericalHistograms:preprocessed_data': {\n", + " 'inputs': {'dataset': 'preprocess_dataset'}\n", + " },\n", + " 'validmind.data_validation.TabularCategoricalBarPlots:preprocessed_data': {\n", + " 'inputs': {'dataset': 'preprocess_dataset'}\n", + " },\n", + " 'validmind.data_validation.TargetRateBarPlots:preprocessed_data': {\n", + " 'inputs': {'dataset': 'preprocess_dataset'},\n", + " 'params': {'default_column': 'loan_status'}\n", + " },\n", + " # Run with the training and test datasets\n", + " 'validmind.data_validation.DescriptiveStatistics:development_data': {\n", + " 'input_grid': 
{'dataset': ['train_dataset', 'test_dataset']}\n", + " },\n", + " 'validmind.data_validation.TabularDescriptionTables:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset']}\n", + " },\n", + " 'validmind.data_validation.ClassImbalance:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset']},\n", + " 'params': {'min_percent_threshold': 10}\n", + " },\n", + " 'validmind.data_validation.UniqueRows:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset']},\n", + " 'params': {'min_percent_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.TabularNumericalHistograms:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset']}\n", + " },\n", + " 'validmind.data_validation.MutualInformation:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset']},\n", + " 'params': {'min_threshold': 0.01}\n", + " },\n", + " 'validmind.data_validation.PearsonCorrelationMatrix:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset']}\n", + " },\n", + " 'validmind.data_validation.HighPearsonCorrelation:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset']},\n", + " 'params': {'max_threshold': 0.3, 'top_n_correlations': 10}\n", + " },\n", + " 'validmind.model_validation.ModelMetadata': {\n", + " 'input_grid': {'model': ['xgb_model_developer_champion', 'rf_model']}\n", + " },\n", + " 'validmind.model_validation.sklearn.ModelParameters': {\n", + " 'input_grid': {'model': ['xgb_model_developer_champion', 'rf_model']}\n", + " },\n", + " 'validmind.model_validation.sklearn.ROCCurve': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset'], 'model': ['xgb_model_developer_champion']}\n", + " },\n", + " 'validmind.model_validation.sklearn.MinimumROCAUCScore': {\n", + " 'input_grid': {'dataset': ['train_dataset', 'test_dataset'], 'model': ['xgb_model_developer_champion']},\n", + " 'params': {'min_threshold': 0.5}\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then batch run and log our tests in `test_config`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for t in test_config:\n", + " print(t)\n", + " try:\n", + " # Check if test has input_grid\n", + " if 'input_grid' in test_config[t]:\n", + " # For tests with input_grid, pass the input_grid configuration\n", + " if 'params' in test_config[t]:\n", + " vm.tests.run_test(t, input_grid=test_config[t]['input_grid'], params=test_config[t]['params']).log()\n", + " else:\n", + " vm.tests.run_test(t, input_grid=test_config[t]['input_grid']).log()\n", + " else:\n", + " # Original logic for regular inputs\n", + " if 'params' in test_config[t]:\n", + " vm.tests.run_test(t, inputs=test_config[t]['inputs'], params=test_config[t]['params']).log()\n", + " else:\n", + " vm.tests.run_test(t, inputs=test_config[t]['inputs']).log()\n", + " except Exception as e:\n", + " print(f\"Error running test {t}: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Work with your validation report\n", + "\n", + "Now that you've logged all your test results and verified the work done by the model development team, head to the ValidMind Platform to wrap up your validation report:\n", + 
"\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you connected to earlier.\n", + "\n", + "2. In the left sidebar that appears for your model, click **Validation Report**.\n", + "\n", + "Include your logged test results as evidence, create risk assessment notes, add findings, and assess compliance, then submit your report for review when it's ready. **Learn more:** [Preparing validation reports](https://docs.validmind.ai/guide/model-validation/preparing-validation-reports.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", + "\n", + "- [Code samples](https://github.com/validmind/validmind-library/tree/main/notebooks/code_samples)\n", + "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "name": "python", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/code_samples/model_validation/xgb_model_champion.pkl b/site/notebooks/code_samples/model_validation/xgb_model_champion.pkl new file mode 100644 index 0000000000..bfe7349b67 Binary files /dev/null and b/site/notebooks/code_samples/model_validation/xgb_model_champion.pkl differ diff --git a/site/notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb b/site/notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb index 0f88228e1e..055028eca4 100644 --- a/site/notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb +++ b/site/notebooks/code_samples/nlp_and_llm/prompt_validation_demo.ipynb @@ -66,7 +66,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/nlp_and_llm/rag_benchmark_demo.ipynb b/site/notebooks/code_samples/nlp_and_llm/rag_benchmark_demo.ipynb new file mode 100644 index 0000000000..329092a4b5 --- /dev/null +++ b/site/notebooks/code_samples/nlp_and_llm/rag_benchmark_demo.ipynb @@ -0,0 +1,1635 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG Model Benchmarking Demo\n", + "\n", + "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. We'll demonstrate how to set up multiple models for benchmarking at each stage of the RAG pipeline - specifically two embedding models, two retrieval models with different parameters, and two LLM models (GPT-3.5 and GPT-4o) - allowing for comparison of performance across different configurations. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "

\n", + "Signing up is FREE — Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", + "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", + "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisites\n", + "\n", + "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q \"validmind[llm]\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q qdrant-client langchain langchain-openai sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Gen AI RAG Template`\n", + " - Use case: `Marketing/Sales - Analytics`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. 
Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read Open AI API Key\n", + "\n", + "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` and `text-embedding-3-large` models for our embeddings, `gpt-3.5-turbo` and `gpt-4o` models for our generator and `gpt-4o` model for our LLM-as-Judge tests. If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load openai api key\n", + "import os\n", + "\n", + "import dotenv\n", + "import nltk\n", + "\n", + "dotenv.load_dotenv()\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt_tab')\n", + "\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", + "\n", + "if not \"OPENAI_API_KEY\" in os.environ:\n", + " raise ValueError(\"OPENAI_API_KEY is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset Loader\n", + "\n", + "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the sample dataset from the library\n", + "from validmind.datasets.llm.rag import rfp\n", + "\n", + "raw_df = rfp.load_data()\n", + "train_df, test_df = rfp.preprocess(raw_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds = vm.init_dataset(\n", + "    train_df,\n", + "    text_column=\"question\",\n", + "    target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + "    test_df,\n", + "    text_column=\"question\",\n", + "    target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data validation\n", + "\n", + "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in suite of text data quality tests to check for common issues such as duplicates and missing values. We can also run some tests to check the sentiment and toxicity of our data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Duplicates\n", + "\n", + "First, let's check for duplicates in our dataset. We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "\n", + "run_test(\n", + "    test_id=\"validmind.data_validation.Duplicates\",\n", + "    inputs={\"dataset\": vm_train_ds},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop Words\n", + "\n", + "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.nlp.StopWords` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    test_id=\"validmind.data_validation.nlp.StopWords\",\n", + "    inputs={\n", + "        \"dataset\": vm_train_ds,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Punctuations\n", + "\n", + "Next, let's check punctuation usage in our dataset. We can use the `validmind.data_validation.nlp.Punctuations` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    test_id=\"validmind.data_validation.nlp.Punctuations\",\n", + "    inputs={\n", + "        \"dataset\": vm_train_ds,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Common Words\n", + "\n", + "Next, let's check for common words in our dataset. 
We can use the `validmind.data_validation.nlp.CommonWords` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    test_id=\"validmind.data_validation.nlp.CommonWords\",\n", + "    inputs={\n", + "        \"dataset\": vm_train_ds,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Language Detection\n", + "\n", + "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.nlp.LanguageDetection` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", + "    inputs={\n", + "        \"dataset\": vm_train_ds,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Toxicity Score\n", + "\n", + "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    \"validmind.data_validation.nlp.Toxicity\",\n", + "    inputs={\n", + "        \"dataset\": vm_train_ds,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Polarity and Subjectivity\n", + "\n", + "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", + "    inputs={\n", + "        \"dataset\": vm_train_ds,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sentiment\n", + "\n", + "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    \"validmind.data_validation.nlp.Sentiment\",\n", + "    inputs={\n", + "        \"dataset\": vm_train_ds,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embedding Model\n", + "\n", + "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding models. We will use the `text-embedding-3-small` and `text-embedding-3-large` models from OpenAI for this purpose, wrapped in the `OpenAIEmbeddings` class from LangChain. These models will be used to \"embed\" our questions, both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the input question when making predictions with our RAG model."
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embedding_small_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", + "\n", + "\n", + "def embed_small(input):\n", + "    \"\"\"Returns a text embedding for the given text\"\"\"\n", + "    return embedding_small_client.embed_query(input[\"question\"])\n", + "\n", + "\n", + "vm_embedder_small = vm.init_model(input_id=\"embedding_small_model\", predict_fn=embed_small)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_large_client = OpenAIEmbeddings(model=\"text-embedding-3-large\")\n", + "\n", + "\n", + "def embed_large(input):\n", + "    \"\"\"Returns a text embedding for the given text\"\"\"\n", + "    return embedding_large_client.embed_query(input[\"question\"])\n", + "\n", + "\n", + "vm_embedder_large = vm.init_model(input_id=\"embedding_large_model\", predict_fn=embed_large)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What we have done here is initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` and `text-embedding-3-large` models. We then created an `embed` function that takes in an `input` dictionary and uses the `embed_query` method of the embedding client to compute the embedding of the `question`. We use a plain function since that is how ValidMind supports any custom model; we will use this strategy for the retrieval and generator models as well, but you could also use, say, a HuggingFace model directly. See the [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html) for more information on which model types are directly supported. Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation, and any test that uses the model will be linked to the logged model and its metadata." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Assign Predictions\n", + "\n", + "To precompute the embeddings for our test set, we can call the `assign_predictions` method of the `vm_test_ds` object we created above. This will compute the embeddings for each question in the test set and store them in a special prediction column of the test set that's linked to the corresponding embedding model. This will allow us to use these embeddings later when we run tests against our embedding models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(vm_embedder_small)\n", + "vm_test_ds.assign_predictions(vm_embedder_large)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run tests\n", + "\n", + "Now that everything is set up for the embedding models, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our models."
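If you'd like to browse the full menu of embedding tests before running them, `list_tests()` can also be filtered by a search string. A small, optional example (assuming the `filter` argument of `list_tests()` matches against test IDs, as in other ValidMind notebooks):

```python
# Optional: list the available embedding tests before running them
vm.tests.list_tests(filter="embeddings")
```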
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"probability\": 0.3,\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"probability\": 0.3,\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"source_lang\": \"en\",\n", + " \"target_lang\": \"fr\",\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"n_components\": 3,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup Vector Store\n", + "\n", + "Great, so now that we have assessed our embedding model and verified that it is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate embeddings for the Train Set\n", + "\n", + "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds.assign_predictions(vm_embedder_small)\n", + "print(vm_train_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Insert embeddings and questions into Vector DB\n", + "\n", + "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.vectorstores import Qdrant\n", + "from langchain_community.document_loaders import DataFrameLoader\n", + "\n", + "# load documents from dataframe\n", + "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", + "docs = loader.load()\n", + "\n", + "# setup vector datastore\n", + "qdrant = Qdrant.from_documents(\n", + " docs,\n", + " embedding_small_client,\n", + " location=\":memory:\", # Local mode with in-memory storage only\n", + " collection_name=\"rfp_rag_collection\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retrieval Model\n", + "\n", + "Now that we have an embedding model and a vector database set up and loaded with our data, we need a retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize it as a ValidMind model and assign predictions with it, just as we did for the embedding models. In this example, we'll create two retrieval models with different `k` parameters (the number of documents retrieved) to benchmark and compare their performance. This approach allows us to evaluate how retrieval depth affects the overall system quality."
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve(input):\n", + " contexts = []\n", + "\n", + " for result in qdrant.similarity_search_with_score(input[\"question\"], k=5):\n", + " document, score = result\n", + " context = f\"Q: {document.page_content}\\n\"\n", + " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", + "\n", + " contexts.append(context)\n", + "\n", + " return contexts\n", + "\n", + "\n", + "vm_retriever_k5 = vm.init_model(input_id=\"retrieval_k5_model\", predict_fn=retrieve)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve(input):\n", + " contexts = []\n", + "\n", + " for result in qdrant.similarity_search_with_score(input[\"question\"], k=10):\n", + " document, score = result\n", + " context = f\"Q: {document.page_content}\\n\"\n", + " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", + "\n", + " contexts.append(context)\n", + "\n", + " return contexts\n", + "\n", + "\n", + "vm_retriever_k10 = vm.init_model(input_id=\"retrieval_k10_model\", predict_fn=retrieve)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_retriever_k5)\n", + "vm_test_ds.assign_predictions(model=vm_retriever_k10)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds._df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generation Model\n", + "\n", + "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` and `gpt-4o` models from OpenAI. Since we have two retrieval models (with different `k` values) and want to test two different LLMs, we'll create a total of four generator models - pairing each retrieval configuration with each LLM to comprehensively evaluate how both retrieval depth and model capability affect response quality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "\n", + "from validmind.models import Prompt\n", + "\n", + "\n", + "system_prompt = \"\"\"\n", + "You are an expert RFP AI assistant.\n", + "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", + "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", + "After that you will be provided with a new RFP question.\n", + "You will generate an answer and respond only with the answer.\n", + "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", + "\"\"\".strip()\n", + "\n", + "openai_client = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " \n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + " \n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k5_gpt35 = vm.init_model(\n", + " input_id=\"generation_k5_gpt35_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + "\n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k10_gpt35 = vm.init_model(\n", + " input_id=\"generation_k10_gpt35_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " \n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + " \n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k5_gpt4o = vm.init_model(\n", + " input_id=\"generation_k5_gpt4o_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + "\n", + " return 
response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k10_gpt4o = vm.init_model(\n", + " input_id=\"generation_k10_gpt4o_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's quickly test it out:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "vm_generator_k5_gpt35.predict(\n", + " pd.DataFrame(\n", + " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_generator_k5_gpt4o.predict(\n", + " pd.DataFrame(\n", + " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prompt Evaluation\n", + "\n", + "Now that we have our generator models initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **Exemplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", + "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **Clarity**: How clearly the prompt states the task.\n", + "- **Conciseness**: How succinctly the prompt states the task.\n", + "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", + "- **Specificity**: How specifically the prompt defines the task."
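+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cells below run each of these tests one at a time so the results are easy to inspect. Equivalently, you could loop over the six test IDs in a single cell instead of running the individual cells; a minimal sketch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: run all six prompt validation tests in one loop\n", + "for aspect in [\n", + " \"Bias\",\n", + " \"Clarity\",\n", + " \"Conciseness\",\n", + " \"Delimitation\",\n", + " \"NegativeInstruction\",\n", + " \"Specificity\",\n", + "]:\n", + " run_test(\n", + " f\"validmind.prompt_validation.{aspect}\",\n", + " inputs={\"model\": vm_generator_k5_gpt4o},\n", + " ).log()"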
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Bias\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup RAG Pipeline Model\n", + "\n", + "Now that we have all of our individual \"component\" models set up and initialized, we need a way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use the pipe (`|`) operator - in Python this is normally the bitwise OR operator, but we have overloaded it for easy pipeline creation - to chain our models together. We can then initialize this pipeline model and assign predictions to it just like any other model." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "vm_rag_k5_gpt35_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt35, input_id=\"rag_k5_gpt35_model\")\n", + "vm_rag_k10_gpt35_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt35, input_id=\"rag_k10_gpt35_model\")\n", + "vm_rag_k5_gpt4o_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt4o, input_id=\"rag_k5_gpt4o_model\")\n", + "vm_rag_k10_gpt4o_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt4o, input_id=\"rag_k10_gpt4o_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt35_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt35_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt4o_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt4o_model)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds._df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run tests\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RAGAS evaluation\n", + "\n", + "Let's go ahead and run some of our new RAG tests against our models.\n", + "\n", + "> Note: these tests are still being developed and are not yet in a stable state. We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area, so stay tuned!" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Semantic Similarity\n", + "\n", + "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", + "\n", + "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Context Entity Recall\n", + "\n", + "This test measures the recall of the retrieved context based on the number of entities present in both the ground truths and the contexts, relative to the number of entities present in the ground truths alone. Simply put, it measures what fraction of the ground-truth entities are recalled by the retrieved contexts. This is useful in fact-based use cases like tourism help desks or historical QA, where entities matter and we need retrieved contexts that cover them."
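+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an illustrative, made-up example: if the ground truth mentions the entities {Paris, Eiffel Tower, 1889} and the retrieved contexts mention only {Paris, Eiffel Tower}, the context entity recall is 2/3."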
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"reference_column\": [\"ground_truth\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Context Precision\n", + "\n", + "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked near the top. Ideally, all the relevant chunks should appear at the top ranks. This test is computed using the question, ground_truth, and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecision\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Context Precision Without Reference\n", + "\n", + "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. It assesses the relevance of each retrieved context chunk by comparing it directly to the response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid=[\n", + " {\"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_k5_model_prediction\",\n", + " \"response_column\": \"rag_k5_gpt4o_model_prediction\"\n", + " },\n", + " {\"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_k10_model_prediction\",\n", + " \"response_column\": \"rag_k10_gpt4o_model_prediction\"\n", + " },\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Faithfulness\n", + "\n", + "This measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context, and the score is scaled to the (0, 1) range, where higher is better.\n", + "\n", + "The generated answer is regarded as faithful if all the claims made in the answer can be inferred from the given context. To calculate this, a set of claims from the generated answer is first identified.
Then each of these claims is cross-checked against the given context to determine whether it can be inferred from it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.Faithfulness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Response Relevancy\n", + "\n", + "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores indicate better relevancy. This test is computed using the question, the context, and the answer.\n", + "\n", + "The response relevancy is defined as the mean cosine similarity of the original question to a number of artificial questions, which were generated (reverse engineered) based on the answer.\n", + "\n", + "Please note that even though in practice the score will range between 0 and 1 most of the time, this is not mathematically guaranteed, since cosine similarity ranges from -1 to 1.\n", + "\n", + "> Note: This is a reference-free test. If you’re looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", + "\n", + "An answer is deemed relevant when it directly and appropriately addresses the original question. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Context Recall\n", + "\n", + "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", + "\n", + "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context."
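+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an illustrative, made-up example: if the ground truth answer contains four sentences and three of them can be attributed to the retrieved context, the context recall is 3/4."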
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Answer Correctness\n", + "\n", + "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", + "\n", + "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", + "\n", + "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", + "\n", + "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", + "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", + "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aspect Critic\n", + "\n", + "This test is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Ragas offers a range of predefined aspects (correctness, harmfulness, and so on), and users also have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of an aspect critique is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AspectCritic\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Noise Sensitivity\n", + "\n", + "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generation quality\n", + "\n", + "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Token Disparity\n", + "\n", + "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." 
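+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For instance, with made-up numbers: if a reference answer runs to 120 tokens while the generated answer has only 40, that disparity would be a prompt to check whether the model is truncating or over-summarizing its answers."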
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.TokenDisparity\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ROUGE Score\n", + "\n", + "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", + "\n", + "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RougeScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + " params={\n", + " \"metric\": \"rouge-1\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BLEU Score\n", + "\n", + "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BleuScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BERT Score\n", + "\n", + "This test evaluates the quality of the RAG generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. 
This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BertScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### METEOR Score\n", + "\n", + "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing on how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.MeteorScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bias and Toxicity\n", + "\n", + "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth. These tests help us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality, enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Toxicity Score\n", + "\n", + "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ToxicityScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regard Score\n", + "\n", + "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The test uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RegardScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate, and document a simple RAG model as it's developed. This is a great example of the interactive development experience that ValidMind is designed to support: we can quickly iterate on our model and document it as we go. We have seen how ValidMind supports non-traditional \"models\" using a functional interface, and how we can build pipelines of many models to support complex GenAI workflows.\n", + "\n", + "This is still a work in progress and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/code_samples/nlp_and_llm/rag_documentation_demo.ipynb b/site/notebooks/code_samples/nlp_and_llm/rag_documentation_demo.ipynb index f6942033ef..dfc9ccc15d 100644 --- a/site/notebooks/code_samples/nlp_and_llm/rag_documentation_demo.ipynb +++ b/site/notebooks/code_samples/nlp_and_llm/rag_documentation_demo.ipynb @@ -33,7 +33,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index e4e48884d3..04ca7aa1c9 100644 --- a/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -33,7 +33,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb b/site/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb index 156f3fb142..5396ee1aa4 100644 --- a/site/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb +++ b/site/notebooks/code_samples/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb @@ -74,7 +74,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/time_series/quickstart_time_series_full_suite.ipynb b/site/notebooks/code_samples/time_series/quickstart_time_series_full_suite.ipynb index edd3ca9b57..c674ffd424 100644 --- a/site/notebooks/code_samples/time_series/quickstart_time_series_full_suite.ipynb +++ b/site/notebooks/code_samples/time_series/quickstart_time_series_full_suite.ipynb @@ -77,7 +77,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/code_samples/time_series/quickstart_time_series_high_code.ipynb b/site/notebooks/code_samples/time_series/quickstart_time_series_high_code.ipynb index 8873b85243..8418dcd0a7 100644 --- a/site/notebooks/code_samples/time_series/quickstart_time_series_high_code.ipynb +++ b/site/notebooks/code_samples/time_series/quickstart_time_series_high_code.ipynb @@ -77,7 +77,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/how_to/add_context_to_llm_descriptions.ipynb b/site/notebooks/how_to/add_context_to_llm_descriptions.ipynb index 6951e52858..45736d2c7a 100644 --- a/site/notebooks/how_to/add_context_to_llm_descriptions.ipynb +++ b/site/notebooks/how_to/add_context_to_llm_descriptions.ipynb @@ -23,7 +23,7 @@ "- [Initialize the Python environment](#toc3_) \n", "- [Load the sample dataset](#toc4_) \n", " - [Preprocess the raw dataset](#toc4_1_) \n", - "- [Initialize the ValidMind objects](#toc5_) \n", + "- [Initializing the ValidMind objects](#toc5_) \n", " - [Initialize the datasets](#toc5_1_) \n", " - [Initialize a model object](#toc5_2_) \n", " - [Assign predictions to the datasets](#toc5_3_) \n", @@ -230,7 +230,7 @@ "source": [ "\n", "\n", - "## Initialize the ValidMind objects" + "## Initializing the ValidMind objects" ] }, { @@ -349,7 +349,7 @@ "\n", "By default, custom context for LLM-generated descriptions is disabled, meaning that the output will not include any additional context.\n", "\n", - "Let's generate an initial test description for the `DatasetDescription` test for comparision with later iterations:" + "Let's generate an initial test description for the `DatasetDescription` test for comparison with later iterations:" ] }, { @@ -376,7 +376,7 @@ "\n", "To enable custom use case context, set the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `1`.\n", "\n", - "This is a global setting that will affect all tests for your linked model:" + "This is a global setting that will affect all tests for your linked model for the duration of your ValidMind Library session:" ] }, { @@ -431,7 +431,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "With the use case context set, generate an updated test description for the `DatasetDescription` test for comparision with default output:" + "With the use case context set, generate an updated test description for the `DatasetDescription` test for comparison with default output:" ] }, { @@ -458,7 +458,7 @@ "\n", "To disable custom use case context, set the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `0`.\n", "\n", - "This is a global setting that will affect all tests for your linked model:" + "This is a global setting that will affect all tests for your linked model for the duration of your ValidMind Library session:" ] }, { @@ -474,7 +474,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "With the use case context disabled again, generate another test description for the `DatasetDescription` test for comparision with previous custom output:" + "With the use case context disabled again, generate another test description for the `DatasetDescription` test for comparison with previous custom output:" ] }, { diff --git a/site/notebooks/how_to/configure_dataset_features.ipynb b/site/notebooks/how_to/configure_dataset_features.ipynb index 9bf927740b..2e381bd895 100644 --- a/site/notebooks/how_to/configure_dataset_features.ipynb +++ b/site/notebooks/how_to/configure_dataset_features.ipynb @@ -67,7 +67,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. 
There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/how_to/document_multiple_results_for_the_same_test.ipynb b/site/notebooks/how_to/document_multiple_results_for_the_same_test.ipynb index 8dc4ab10df..2529727af2 100644 --- a/site/notebooks/how_to/document_multiple_results_for_the_same_test.ipynb +++ b/site/notebooks/how_to/document_multiple_results_for_the_same_test.ipynb @@ -82,7 +82,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/how_to/explore_test_suites.ipynb b/site/notebooks/how_to/explore_test_suites.ipynb index 7cb5e2e49f..4fb36d894b 100644 --- a/site/notebooks/how_to/explore_test_suites.ipynb +++ b/site/notebooks/how_to/explore_test_suites.ipynb @@ -63,7 +63,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/how_to/explore_tests.ipynb b/site/notebooks/how_to/explore_tests.ipynb index 9a60f9c08b..672c98fc33 100644 --- a/site/notebooks/how_to/explore_tests.ipynb +++ b/site/notebooks/how_to/explore_tests.ipynb @@ -81,1191 +81,1786 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Removed output (original test listing):

| ID | Name | Description | Required Inputs | Params |
|---|---|---|---|---|
| validmind.prompt_validation.Bias | Bias | Evaluates bias in a Large Language Model based on the order and distribution of exemplars in a prompt.... | ['model.prompt'] | {'min_threshold': 7} |
| validmind.prompt_validation.Clarity | Clarity | Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.... | ['model.prompt'] | {'min_threshold': 7} |
| validmind.prompt_validation.Specificity | Specificity | Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity,... | ['model.prompt'] | {'min_threshold': 7} |
| validmind.prompt_validation.Robustness | Robustness | Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.... | ['model'] | {'num_tests': 10} |
| validmind.prompt_validation.NegativeInstruction | Negative Instruction | Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.... | ['model.prompt'] | {'min_threshold': 7} |
| validmind.prompt_validation.Conciseness | Conciseness | Analyzes and grades the conciseness of prompts provided to a Large Language Model.... | ['model.prompt'] | {'min_threshold': 7} |
| validmind.prompt_validation.Delimitation | Delimitation | Evaluates the proper use of delimiters in prompts provided to Large Language Models.... | ['model.prompt'] | {'min_threshold': 7} |
| validmind.model_validation.ModelPredictionResiduals | Model Prediction Residuals | Plot the residuals and histograms for each model, and generate a summary table... | ['datasets', 'models'] | {'nbins': 100, 'p_value_threshold': 0.05, 'start_date': None, 'end_date': None} |
| validmind.model_validation.BertScore | Bert Score | Evaluates the quality of machine-generated text using BERTScore metrics and visualizes the results through histograms... | ['dataset', 'model'] | {} |
| validmind.model_validation.TimeSeriesPredictionsPlot | Time Series Predictions Plot | Plot actual vs predicted values for time series data and generate a visual comparison for each model.... | ['datasets', 'models'] | {} |
| validmind.model_validation.RegardScore | Regard Score | Computes and visualizes the regard score for each text instance, assessing sentiment and potential biases.... | ['dataset', 'model'] | {} |
| validmind.model_validation.BleuScore | Bleu Score | Evaluates the quality of machine-generated text using BLEU metrics and visualizes the results through histograms... | ['dataset', 'model'] | {} |
| validmind.model_validation.TimeSeriesPredictionWithCI | Time Series Prediction With CI | Plot actual vs predicted values for a time series with confidence intervals and compute breaches.... | ['dataset', 'model'] | {'confidence': 0.95} |
| validmind.model_validation.RegressionResidualsPlot | Regression Residuals Plot | Evaluates regression model performance using residual distribution and actual vs. predicted plots.... | ['model', 'dataset'] | {'bin_size': 0.1} |
| validmind.model_validation.FeaturesAUC | Features AUC | Evaluates the discriminatory power of each individual feature within a binary classification model by calculating the Area Under the Curve (AUC) for each feature separately.... | ['model', 'dataset'] | {'fontsize': 12, 'figure_height': 500} |
| validmind.model_validation.ContextualRecall | Contextual Recall | Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct text, visualizing the results through histograms and bar charts, alongside compiling a comprehensive table of descriptive statistics for contextual recall scores.... | ['dataset', 'model'] | {} |
| validmind.model_validation.MeteorScore | Meteor Score | Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.... | ['dataset', 'model'] | {} |
| validmind.model_validation.RougeScore | Rouge Score | Evaluates the quality of machine-generated text using ROUGE metrics and visualizes the results through histograms... | ['dataset', 'model'] | {'metric': 'rouge-1'} |
| validmind.model_validation.ModelMetadata | Model Metadata | Extracts and summarizes critical metadata from a machine learning model instance for comprehensive analysis.... | ['model'] | None |
| validmind.model_validation.ClusterSizeDistribution | Cluster Size Distribution | Compares and visualizes the distribution of cluster sizes in model predictions and actual data for assessing... | ['model', 'dataset'] | None |
| validmind.model_validation.TokenDisparity | Token Disparity | Evaluates the token disparity between reference and generated texts, visualizing the results through histograms... | ['dataset', 'model'] | {} |
| validmind.model_validation.ToxicityScore | Toxicity Score | Computes and visualizes the toxicity score for input text, true text, and predicted text, assessing content quality and potential risk.... | ['dataset', 'model'] | {} |
| validmind.model_validation.ModelMetadataComparison | Model Metadata Comparison | Compare metadata of different models and generate a summary table with the results.... | ['models'] | {} |
| validmind.model_validation.TimeSeriesR2SquareBySegments | Time Series R2 Square By Segments | Plot R-Squared values for each model over specified time segments and generate a bar chart... | ['datasets', 'models'] | {'segments': None} |
| validmind.model_validation.embeddings.CosineSimilarityComparison | Cosine Similarity Comparison | Computes pairwise cosine similarities between model embeddings and visualizes the results through bar charts,... | ['dataset', 'models'] | {} |
| validmind.model_validation.embeddings.EmbeddingsVisualization2D | Embeddings Visualization2 D | Visualizes 2D representation of text embeddings generated by a model using t-SNE technique.... | ['model', 'dataset'] | {'cluster_column': None, 'perplexity': 30} |
| validmind.model_validation.embeddings.StabilityAnalysisRandomNoise | Stability Analysis Random Noise | Evaluate robustness of embeddings models to random noise introduced by using... | ['model', 'dataset'] | {'mean_similarity_threshold': 0.7, 'probability': 0.02} |
| validmind.model_validation.embeddings.TSNEComponentsPairwisePlots | TSNE Components Pairwise Plots | Plots individual scatter plots for pairwise combinations of t-SNE components of embeddings.... | ['dataset', 'model'] | {'n_components': 2, 'perplexity': 30, 'title': 't-SNE'} |
| validmind.model_validation.embeddings.CosineSimilarityDistribution | Cosine Similarity Distribution | Assesses the similarity between predicted text embeddings from a model using a Cosine Similarity distribution... | ['model', 'dataset'] | None |
| validmind.model_validation.embeddings.PCAComponentsPairwisePlots | PCA Components Pairwise Plots | Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model embeddings.... | ['dataset', 'model'] | {'n_components': 3} |
| validmind.model_validation.embeddings.CosineSimilarityHeatmap | Cosine Similarity Heatmap | Generates an interactive heatmap to visualize the cosine similarities among embeddings derived from a given model.... | ['dataset', 'model'] | {'title': 'Cosine Similarity Matrix', 'color': 'Cosine Similarity', 'xaxis_title': 'Index', 'yaxis_title': 'Index', 'color_scale': 'Blues'} |
| validmind.model_validation.embeddings.StabilityAnalysisTranslation | Stability Analysis Translation | Evaluate robustness of embeddings models to noise introduced by translating... | ['model', 'dataset'] | {'source_lang': 'en', 'target_lang': 'fr', 'mean_similarity_threshold': 0.7} |
| validmind.model_validation.embeddings.EuclideanDistanceComparison | Euclidean Distance Comparison | Computes pairwise Euclidean distances between model embeddings and visualizes the results through bar charts,... | ['dataset', 'models'] | {} |
| validmind.model_validation.embeddings.ClusterDistribution | Cluster Distribution | Assesses the distribution of text embeddings across clusters produced by a model using KMeans clustering.... | ['model', 'dataset'] | {'num_clusters': 5} |
| validmind.model_validation.embeddings.EuclideanDistanceHeatmap | Euclidean Distance Heatmap | Generates an interactive heatmap to visualize the Euclidean distances among embeddings derived from a given model.... | ['dataset', 'model'] | {'title': 'Euclidean Distance Matrix', 'color': 'Euclidean Distance', 'xaxis_title': 'Index', 'yaxis_title': 'Index', 'color_scale': 'Blues'} |
| validmind.model_validation.embeddings.StabilityAnalysis | Stability Analysis | Base class for embeddings stability analysis tests | ['model', 'dataset'] | {'mean_similarity_threshold': 0.7} |
| validmind.model_validation.embeddings.StabilityAnalysisKeyword | Stability Analysis Keyword | Evaluate robustness of embeddings models to keyword swaps on the test dataset... | ['model', 'dataset'] | {'keyword_dict': None, 'mean_similarity_threshold': 0.7} |
| validmind.model_validation.embeddings.StabilityAnalysisSynonyms | Stability Analysis Synonyms | Evaluates the stability of text embeddings models when words in test data are replaced by their synonyms randomly.... | ['model', 'dataset'] | {'probability': 0.02, 'mean_similarity_threshold': 0.7} |
| validmind.model_validation.embeddings.DescriptiveAnalytics | Descriptive Analytics | Evaluates statistical properties of text embeddings in an ML model via mean, median, and standard deviation... | ['model', 'dataset'] | None |
| validmind.model_validation.ragas.ContextEntityRecall | Context Entity Recall | Evaluates the context entity recall for dataset entries and visualizes the results.... | ['dataset'] | {'contexts_column': 'contexts', 'ground_truth_column': 'ground_truth'} |
| validmind.model_validation.ragas.Faithfulness | Faithfulness | Evaluates the faithfulness of the generated answers with respect to retrieved contexts.... | ['dataset'] | {'answer_column': 'answer', 'contexts_column': 'contexts'} |
| validmind.model_validation.ragas.AspectCritique | Aspect Critique | Evaluates generations against the following aspects: harmfulness, maliciousness,... | ['dataset'] | {'question_column': 'question', 'answer_column': 'answer', 'contexts_column': 'contexts', 'aspects': ['coherence', 'conciseness', 'correctness', 'harmfulness', 'maliciousness'], 'additional_aspects': None} |
| validmind.model_validation.ragas.AnswerSimilarity | Answer Similarity | Calculates the semantic similarity between generated answers and ground truths... | ['dataset'] | {'answer_column': 'answer', 'ground_truth_column': 'ground_truth'} |
| validmind.model_validation.ragas.AnswerCorrectness | Answer Correctness | Evaluates the correctness of answers in a dataset with respect to the provided ground... | ['dataset'] | {'question_column': 'question', 'answer_column': 'answer', 'ground_truth_column': 'ground_truth'} |
| validmind.model_validation.ragas.ContextRecall | Context Recall | Context recall measures the extent to which the retrieved context aligns with the... | ['dataset'] | {'question_column': 'question', 'contexts_column': 'contexts', 'ground_truth_column': 'ground_truth'} |
| validmind.model_validation.ragas.ContextRelevancy | Context Relevancy | Evaluates the context relevancy metric for entries in a dataset and visualizes the... | ['dataset'] | {'question_column': 'question', 'contexts_column': 'contexts'} |
| validmind.model_validation.ragas.ContextPrecision | Context Precision | Context Precision is a metric that evaluates whether all of the ground-truth... | ['dataset'] | {'question_column': 'question', 'contexts_column': 'contexts', 'ground_truth_column': 'ground_truth'} |
| validmind.model_validation.ragas.AnswerRelevance | Answer Relevance | Assesses how pertinent the generated answer is to the given prompt.... | ['dataset'] | {'question_column': 'question', 'contexts_column': 'contexts', 'answer_column': 'answer'} |
| validmind.model_validation.sklearn.RegressionModelsPerformanceComparison | Regression Models Performance Comparison | Compares and evaluates the performance of multiple regression models using five different metrics: MAE, MSE, RMSE,... | ['dataset', 'models'] | None |
| validmind.model_validation.sklearn.AdjustedMutualInformation | Adjusted Mutual Information | Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.SilhouettePlot | Silhouette Plot | Calculates and visualizes Silhouette Score, assessing degree of data point suitability to its cluster in ML models.... | ['model', 'dataset'] | None |
| validmind.model_validation.sklearn.RobustnessDiagnosis | Robustness Diagnosis | Evaluates the robustness of a machine learning model by injecting Gaussian noise to input data and measuring... | ['model', 'datasets'] | {'features_columns': None, 'scaling_factor_std_dev_list': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], 'accuracy_decay_threshold': 4} |
| validmind.model_validation.sklearn.AdjustedRandIndex | Adjusted Rand Index | Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.SHAPGlobalImportance | SHAP Global Importance | Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.... | ['model', 'dataset'] | {'kernel_explainer_samples': 10, 'tree_or_linear_explainer_samples': 200} |
| validmind.model_validation.sklearn.ConfusionMatrix | Confusion Matrix | Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix... | ['model', 'dataset'] | None |
| validmind.model_validation.sklearn.HomogeneityScore | Homogeneity Score | Assesses clustering homogeneity by comparing true and predicted labels, scoring from 0 (heterogeneous) to 1... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.CompletenessScore | Completeness Score | Evaluates a clustering model's capacity to categorize instances from a single class into the same cluster.... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.OverfitDiagnosis | Overfit Diagnosis | Detects and visualizes overfit regions in an ML model by comparing performance on training and test datasets.... | ['model', 'datasets'] | {'features_columns': None, 'cut_off_percentage': 4} |
| validmind.model_validation.sklearn.ClusterPerformanceMetrics | Cluster Performance Metrics | Evaluates the performance of clustering machine learning models using multiple established metrics.... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.PermutationFeatureImportance | Permutation Feature Importance | Assesses the significance of each feature in a model by evaluating the impact on model performance when feature... | ['model', 'dataset'] | {'fontsize': None, 'figure_height': 1000} |
| validmind.model_validation.sklearn.FowlkesMallowsScore | Fowlkes Mallows Score | Evaluates the similarity between predicted and actual cluster assignments in a model using the Fowlkes-Mallows... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.MinimumROCAUCScore | Minimum ROCAUC Score | Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.... | ['model', 'dataset'] | {'min_threshold': 0.5} |
| validmind.model_validation.sklearn.ClusterCosineSimilarity | Cluster Cosine Similarity | Measures the intra-cluster similarity of a clustering model using cosine similarity.... | ['model', 'dataset'] | None |
| validmind.model_validation.sklearn.PrecisionRecallCurve | Precision Recall Curve | Evaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve.... | ['model', 'dataset'] | None |
| validmind.model_validation.sklearn.ClassifierPerformance | Classifier Performance | Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,... | ['model', 'dataset'] | None |
| validmind.model_validation.sklearn.VMeasure | V Measure | Evaluates homogeneity and completeness of a clustering model using the V Measure Score.... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.MinimumF1Score | Minimum F1 Score | Evaluates if the model's F1 score on the validation set meets a predefined minimum threshold.... | ['model', 'dataset'] | {'min_threshold': 0.5} |
| validmind.model_validation.sklearn.ROCCurve | ROC Curve | Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic... | ['model', 'dataset'] | None |
| validmind.model_validation.sklearn.RegressionR2Square | Regression R2 Square | **Purpose**: The purpose of the RegressionR2Square Metric test is to measure the overall goodness-of-fit of a... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.RegressionErrors | Regression Errors | **Purpose**: This metric is used to measure the performance of a regression model. It gauges the model's accuracy... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.ClusterPerformance | Cluster Performance | Evaluates and compares a clustering model's performance on training and testing datasets using multiple defined... | ['model', 'datasets'] | None |
| validmind.model_validation.sklearn.FeatureImportanceComparison | Feature Importance Comparison | Compare feature importance scores for each model and generate a summary table... | ['datasets', 'models'] | {'num_features': 3} |
| validmind.model_validation.sklearn.TrainingTestDegradation | Training Test Degradation | Tests if model performance degradation between training and test datasets exceeds a predefined threshold.... | ['model', 'datasets'] | {'metrics': ['accuracy', 'precision', 'recall', 'f1'], 'max_threshold': 0.1} |
| validmind.model_validation.sklearn.RegressionErrorsComparison | Regression Errors Comparison | Compare regression error metrics for each model and generate a summary table... | ['datasets', 'models'] | {} |
| validmind.model_validation.sklearn.HyperParametersTuning | Hyper Parameters Tuning | Exerts exhaustive grid search to identify optimal hyperparameters for the model, improving performance.... | ['model', 'dataset'] | {'param_grid': None, 'scoring': None} |
| validmind.model_validation.sklearn.KMeansClustersOptimization | K Means Clusters Optimization | Optimizes the number of clusters in K-means models using Elbow and Silhouette methods.... | ['model', 'dataset'] | {'n_clusters': None} |
| validmind.model_validation.sklearn.ModelsPerformanceComparison | Models Performance Comparison | Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,... | ['dataset', 'models'] | None |
| validmind.model_validation.sklearn.WeakspotsDiagnosis | Weakspots Diagnosis | Identifies and visualizes weak spots in a machine learning model's performance across various sections of the... | ['model', 'datasets'] | {'features_columns': None, 'thresholds': {'accuracy': 0.75, 'precision': 0.5, 'recall': 0.5, 'f1': 0.7}} |
| validmind.model_validation.sklearn.RegressionR2SquareComparison | Regression R2 Square Comparison | Compare R-Squared and Adjusted R-Squared values for each model and generate a summary table... | ['datasets', 'models'] | {} |
| validmind.model_validation.sklearn.PopulationStabilityIndex | Population Stability Index | Evaluates the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across... | ['model', 'datasets'] | {'num_bins': 10, 'mode': 'fixed'} |
| validmind.model_validation.sklearn.MinimumAccuracy | Minimum Accuracy | Checks if the model's prediction accuracy meets or surpasses a specified threshold.... | ['model', 'dataset'] | {'min_threshold': 0.7} |
| validmind.model_validation.statsmodels.RegressionModelsCoeffs | Regression Models Coeffs | Compares feature importance by evaluating and contrasting coefficients of different regression models.... | ['models'] | None |
| validmind.model_validation.statsmodels.BoxPierce | Box Pierce | Detects autocorrelation in time-series data through the Box-Pierce test to validate model performance.... | ['dataset'] | None |
| validmind.model_validation.statsmodels.RegressionCoeffsPlot | Regression Coeffs Plot | Visualizes regression coefficients with 95% confidence intervals to assess predictor variables' impact on response... | ['models'] | None |
| validmind.model_validation.statsmodels.RegressionModelSensitivityPlot | Regression Model Sensitivity Plot | Tests the sensitivity of a regression model to variations in independent variables by applying shocks and... | ['models', 'datasets'] | {'transformation': None, 'shocks': [0.1]} |
| validmind.model_validation.statsmodels.RegressionModelForecastPlotLevels | Regression Model Forecast Plot Levels | Compares and visualizes forecasted and actual values of regression models on both raw and transformed datasets.... | ['models', 'datasets'] | {'transformation': None} |
| validmind.model_validation.statsmodels.ScorecardHistogram | Scorecard Histogram | Creates histograms of credit scores, from both default and non-default instances, generated by a credit-risk model.... | ['datasets'] | {'title': 'Histogram of Scores', 'score_column': 'score'} |
| validmind.model_validation.statsmodels.LJungBox | L Jung Box | Assesses autocorrelations in dataset features by performing a Ljung-Box test on each feature.... | ['dataset'] | None |
| validmind.model_validation.statsmodels.JarqueBera | Jarque Bera | Assesses normality of dataset features in an ML model using the Jarque-Bera test.... | ['dataset'] | None |
| validmind.model_validation.statsmodels.KolmogorovSmirnov | Kolmogorov Smirnov | Executes a feature-wise Kolmogorov-Smirnov test to evaluate alignment with normal distribution in datasets.... | ['dataset'] | {'dist': 'norm'} |
| validmind.model_validation.statsmodels.ShapiroWilk | Shapiro Wilk | Evaluates feature-wise normality of training data using the Shapiro-Wilk test.... | ['dataset'] | None |
| validmind.model_validation.statsmodels.CumulativePredictionProbabilities | Cumulative Prediction Probabilities | Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic... | ['model', 'datasets'] | {'title': 'Cumulative Probabilities'} |
| validmind.model_validation.statsmodels.RegressionFeatureSignificance | Regression Feature Significance | Assesses and visualizes the statistical significance of features in a set of regression models.... | ['models'] | {'fontsize': 10, 'p_threshold': 0.05} |
| validmind.model_validation.statsmodels.RegressionModelSummary | Regression Model Summary | Evaluates regression model performance using metrics including R-Squared, Adjusted R-Squared, MSE, and RMSE.... | ['model', 'dataset'] | None |
| validmind.model_validation.statsmodels.Lilliefors | Lilliefors | Assesses the normality of feature distributions in an ML model's training dataset using the Lilliefors test.... | ['dataset'] | None |
| validmind.model_validation.statsmodels.RunsTest | Runs Test | Executes Runs Test on ML model to detect non-random patterns in output data sequence.... | ['dataset'] | None |
| validmind.model_validation.statsmodels.RegressionPermutationFeatureImportance | Regression Permutation Feature Importance | Assesses the significance of each feature in a model by evaluating the impact on model performance when feature... | ['model', 'dataset'] | {'fontsize': 12, 'figure_height': 500} |
| validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram | Prediction Probabilities Histogram | Generates and visualizes histograms of the Probability of Default predictions for both positive and negative... | ['model', 'datasets'] | {'title': 'Histogram of Predictive Probabilities'} |
| validmind.model_validation.statsmodels.AutoARIMA | Auto ARIMA | Evaluates ARIMA models for time-series forecasting, ranking them using Bayesian and Akaike Information Criteria.... | ['dataset'] | None |
| validmind.model_validation.statsmodels.GINITable | GINI Table | Evaluates classification model performance using AUC, GINI, and KS metrics for training and test datasets.... | ['model', 'datasets'] | None |
| validmind.model_validation.statsmodels.RegressionModelForecastPlot | Regression Model Forecast Plot | Generates plots to visually compare the forecasted outcomes of one or more regression models against actual... | ['models', 'datasets'] | {'start_date': None, 'end_date': None} |
| validmind.model_validation.statsmodels.DurbinWatsonTest | Durbin Watson Test | Assesses autocorrelation in time series data features using the Durbin-Watson statistic.... | ['dataset'] | None |
| validmind.data_validation.MissingValuesRisk | Missing Values Risk | Assesses and quantifies the risk related to missing values in a dataset used for training an ML model.... | ['dataset'] | None |
| validmind.data_validation.IQROutliersTable | IQR Outliers Table | Determines and summarizes outliers in numerical features using Interquartile Range method.... | ['dataset'] | {'features': None, 'threshold': 1.5} |
| validmind.data_validation.BivariateFeaturesBarPlots | Bivariate Features Bar Plots | Generates visual bar plots to analyze the relationship between paired features within categorical data in the model.... | ['dataset'] | {'features_pairs': None} |
| validmind.data_validation.Skewness | Skewness | Evaluates the skewness of numerical data in a machine learning model and checks if it falls below a set maximum... | ['dataset'] | {'max_threshold': 1} |
| validmind.data_validation.Duplicates | Duplicates | Tests dataset for duplicate entries, ensuring model reliability via data quality verification.... | ['dataset'] | {'min_threshold': 1} |
| validmind.data_validation.MissingValuesBarPlot | Missing Values Bar Plot | Creates a bar plot showcasing the percentage of missing values in each column of the dataset with risk... | ['dataset'] | {'threshold': 80, 'fig_height': 600} |
| validmind.data_validation.DatasetDescription | Dataset Description | Provides comprehensive analysis and statistical summaries of each field in a machine learning model's dataset.... | ['dataset'] | None |
| validmind.data_validation.ZivotAndrewsArch | Zivot Andrews Arch | Evaluates the order of integration and stationarity of time series data using Zivot-Andrews unit root test.... | ['dataset'] | None |
| validmind.data_validation.ScatterPlot | Scatter Plot | Creates a scatter plot matrix to visually analyze feature relationships, patterns, and outliers in a dataset.... | ['dataset'] | None |
| validmind.data_validation.TimeSeriesOutliers | Time Series Outliers | Identifies and visualizes outliers in time-series data using z-score method.... | ['dataset'] | {'zscore_threshold': 3} |
| validmind.data_validation.TabularCategoricalBarPlots | Tabular Categorical Bar Plots | Generates and visualizes bar plots for each category in categorical features to evaluate dataset's composition.... | ['dataset'] | None |
| validmind.data_validation.AutoStationarity | Auto Stationarity | Automates Augmented Dickey-Fuller test to assess stationarity across multiple time series in a DataFrame.... | ['dataset'] | {'max_order': 5, 'threshold': 0.05} |
| validmind.data_validation.DescriptiveStatistics | Descriptive Statistics | Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's... | ['dataset'] | None |
| validmind.data_validation.TimeSeriesDescription | Time Series Description | Generates a detailed analysis for the provided time series dataset.... | ['dataset'] | {} |
| validmind.data_validation.ANOVAOneWayTable | ANOVA One Way Table | Applies one-way ANOVA (Analysis of Variance) to identify statistically significant numerical features in the... | ['dataset'] | {'features': None, 'p_threshold': 0.05} |
| validmind.data_validation.TargetRateBarPlots | Target Rate Bar Plots | Generates bar plots visualizing the default rates of categorical features for a classification machine learning... | ['dataset'] | {'default_column': None, 'columns': None} |
| validmind.data_validation.PearsonCorrelationMatrix | Pearson Correlation Matrix | Evaluates linear dependency between numerical variables in a dataset via a Pearson Correlation coefficient heat map.... | ['dataset'] | None |
| validmind.data_validation.FeatureTargetCorrelationPlot | Feature Target Correlation Plot | Visualizes the correlation between input features and model's target output in a color-coded horizontal bar plot.... | ['dataset'] | {'features': None, 'fig_height': 600} |
| validmind.data_validation.TabularNumericalHistograms | Tabular Numerical Histograms | Generates histograms for each numerical feature in a dataset to provide visual insights into data distribution and... | ['dataset'] | None |
| validmind.data_validation.IsolationForestOutliers | Isolation Forest Outliers | Detects outliers in a dataset using the Isolation Forest algorithm and visualizes results through scatter plots.... | ['dataset'] | {'random_state': 0, 'contamination': 0.1, 'features_columns': None} |
| validmind.data_validation.ChiSquaredFeaturesTable | Chi Squared Features Table | Executes Chi-Squared test for each categorical feature against a target column to assess significant association.... | ['dataset'] | {'cat_features': None, 'p_threshold': 0.05} |
| validmind.data_validation.HighCardinality | High Cardinality | Assesses the number of unique values in categorical columns to detect high cardinality and potential overfitting.... | ['dataset'] | {'num_threshold': 100, 'percent_threshold': 0.1, 'threshold_type': 'percent'} |
| validmind.data_validation.MissingValues | Missing Values | Evaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold.... | ['dataset'] | {'min_threshold': 1} |
| validmind.data_validation.PhillipsPerronArch | Phillips Perron Arch | Executes Phillips-Perron test to assess the stationarity of time series data in each ML model feature.... | ['dataset'] | None |
| validmind.data_validation.RollingStatsPlot | Rolling Stats Plot | This test evaluates the stationarity of time series data by plotting its rolling mean and standard deviation.... | ['dataset'] | {'window_size': 12} |
| validmind.data_validation.TabularDescriptionTables | Tabular Description Tables | Summarizes key descriptive statistics for numerical, categorical, and datetime variables in a dataset.... | ['dataset'] | None |
| validmind.data_validation.AutoMA | Auto MA | Automatically selects the optimal Moving Average (MA) order for each variable in a time series dataset based on... | ['dataset'] | {'max_ma_order': 3} |
| validmind.data_validation.UniqueRows | Unique Rows | Verifies the diversity of the dataset by ensuring that the count of unique rows exceeds a prescribed threshold.... | ['dataset'] | {'min_percent_threshold': 1} |
| validmind.data_validation.TooManyZeroValues | Too Many Zero Values | Identifies numerical columns in a dataset that contain an excessive number of zero values, defined by a threshold... | ['dataset'] | {'max_percent_threshold': 0.03} |
| validmind.data_validation.HighPearsonCorrelation | High Pearson Correlation | Identifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity.... | ['dataset'] | {'max_threshold': 0.3} |
| validmind.data_validation.ACFandPACFPlot | AC Fand PACF Plot | Analyzes time series data using Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots to... | ['dataset'] | None |
| validmind.data_validation.BivariateHistograms | Bivariate Histograms | Generates bivariate histograms for paired features, aiding in visual inspection of categorical variables'... | ['dataset'] | {'features_pairs': None, 'target_filter': None} |
| validmind.data_validation.WOEBinTable | WOE Bin Table | Calculates and assesses the Weight of Evidence (WoE) and Information Value (IV) of each feature in a ML model.... | ['dataset'] | {'breaks_adj': None} |
| validmind.data_validation.HeatmapFeatureCorrelations | Heatmap Feature Correlations | Creates a heatmap to visually represent correlation patterns between pairs of numerical features in a dataset.... | ['dataset'] | {'declutter': None, 'fontsize': None, 'num_features': None} |
| validmind.data_validation.TimeSeriesFrequency | Time Series Frequency | Evaluates consistency of time series data frequency and generates a frequency plot.... | ['dataset'] | None |
| validmind.data_validation.DatasetSplit | Dataset Split | Evaluates and visualizes the distribution proportions among training, testing, and validation datasets of an ML... | ['datasets'] | None |
| validmind.data_validation.SpreadPlot | Spread Plot | Visualizes the spread relationship between pairs of time-series variables in a dataset, thereby aiding in... | ['dataset'] | None |
| validmind.data_validation.TimeSeriesLinePlot | Time Series Line Plot | Generates and analyses time-series data through line plots revealing trends, patterns, anomalies over time.... | ['dataset'] | None |
| validmind.data_validation.KPSS | KPSS | Executes KPSS unit root test to validate stationarity of time-series data in machine learning model.... | ['dataset'] | None |
| validmind.data_validation.AutoSeasonality | Auto Seasonality | Automatically identifies and quantifies optimal seasonality in time series data to improve forecasting model... | ['dataset'] | {'min_period': 1, 'max_period': 4} |
| validmind.data_validation.BivariateScatterPlots | Bivariate Scatter Plots | Generates bivariate scatterplots to visually inspect relationships between pairs of predictor variables in machine... | ['dataset'] | {'selected_columns': None} |
| validmind.data_validation.EngleGrangerCoint | Engle Granger Coint | Validates co-integration in pairs of time series data using the Engle-Granger test and classifies them as... | ['dataset'] | {'threshold': 0.05} |
| validmind.data_validation.TimeSeriesMissingValues | Time Series Missing Values | Validates time-series data quality by confirming the count of missing values is below a certain threshold.... | ['dataset'] | {'min_threshold': 1} |
| validmind.data_validation.TimeSeriesHistogram | Time Series Histogram | Visualizes distribution of time-series data using histograms and Kernel Density Estimation (KDE) lines.... | ['dataset'] | {'nbins': 30} |
| validmind.data_validation.LaggedCorrelationHeatmap | Lagged Correlation Heatmap | Assesses and visualizes correlation between target variable and lagged independent variables in a time-series... | ['dataset'] | None |
| validmind.data_validation.SeasonalDecompose | Seasonal Decompose | Decomposes dataset features into observed, trend, seasonal, and residual components to identify patterns and... | ['dataset'] | {'seasonal_model': 'additive'} |
| validmind.data_validation.WOEBinPlots | WOE Bin Plots | Generates visualizations of Weight of Evidence (WoE) and Information Value (IV) for understanding predictive power... | ['dataset'] | {'breaks_adj': None, 'fig_height': 600, 'fig_width': 500} |
| validmind.data_validation.ClassImbalance | Class Imbalance | Evaluates and quantifies class distribution imbalance in a dataset used by a machine learning model.... | ['dataset'] | {'min_percent_threshold': 10} |
| validmind.data_validation.IQROutliersBarPlot | IQR Outliers Bar Plot | Visualizes outlier distribution across percentiles in numerical data using Interquartile Range (IQR) method.... | ['dataset'] | {'threshold': 1.5, 'num_features': None, 'fig_width': 800} |
| validmind.data_validation.DFGLSArch | DFGLS Arch | Executes Dickey-Fuller GLS metric to determine order of integration and check stationarity in time series data.... | ['dataset'] | None |
| validmind.data_validation.TimeSeriesDescriptiveStatistics | Time Series Descriptive Statistics | Generates a detailed table of descriptive statistics for the provided time series dataset.... | ['dataset'] | {} |
| validmind.data_validation.AutoAR | Auto AR | Automatically identifies the optimal Autoregressive (AR) order for a time series using BIC and AIC criteria.... | ['dataset'] | {'max_ar_order': 3} |
| validmind.data_validation.TabularDateTimeHistograms | Tabular Date Time Histograms | Generates histograms to provide graphical insight into the distribution of time intervals in model's datetime data.... | ['dataset'] | None |
| validmind.data_validation.ADF | ADF | Assesses the stationarity of a time series dataset using the Augmented Dickey-Fuller (ADF) test.... | ['dataset'] | None |
| validmind.data_validation.nlp.Toxicity | Toxicity | Analyzes the toxicity of text data within a dataset using a pre-trained toxicity model.... | ['dataset'] | {} |
| validmind.data_validation.nlp.PolarityAndSubjectivity | Polarity And Subjectivity | Analyzes the polarity and subjectivity of text data within a dataset.... | ['dataset'] | {} |
| validmind.data_validation.nlp.Punctuations | Punctuations | Analyzes and visualizes the frequency distribution of punctuation usage in a given text dataset.... | ['dataset'] | None |
| validmind.data_validation.nlp.Sentiment | Sentiment | Analyzes the sentiment of text data within a dataset using the VADER sentiment analysis tool.... | ['dataset'] | {} |
| validmind.data_validation.nlp.CommonWords | Common Words | Identifies and visualizes the 40 most frequent non-stopwords in a specified text column within a dataset.... | ['dataset'] | None |
| validmind.data_validation.nlp.Hashtags | Hashtags | Assesses hashtag frequency in a text column, highlighting usage trends and potential dataset bias or spam.... | ['dataset'] | {'top_hashtags': 25} |
| validmind.data_validation.nlp.LanguageDetection | Language Detection | Detects the language of each text entry in a dataset and visualizes the distribution of languages... | ['dataset'] | {} |
| validmind.data_validation.nlp.Mentions | Mentions | Calculates and visualizes frequencies of '@' prefixed mentions in a text-based dataset for NLP model analysis.... | ['dataset'] | {'top_mentions': 25} |
| validmind.data_validation.nlp.TextDescription | Text Description | Performs comprehensive textual analysis on a dataset using NLTK, evaluating various parameters and generating... | ['dataset'] | {'unwanted_tokens': {' ', 'dollar', "''", 's', 'us', 'ms', "s'", '``', 'mr', 'mrs', "'s", 'dr'}, 'num_top_words': 3, 'lang': 'english'} |
| validmind.data_validation.nlp.StopWords | Stop Words | Evaluates and visualizes the frequency of English stop words in a text dataset against a defined threshold.... | ['dataset'] | {'min_percent_threshold': 0.5, 'num_words': 25} |
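The replacement output extends each row with Tags and Tasks columns and records each parameter's type and default rather than a bare value. A hedged sketch of how those new columns can drive the listing, assuming `list_tests()` accepts `tags` and `task` arguments as the published notebooks suggest:

```python
import validmind as vm

# Use the new metadata columns to narrow the catalog: tests tagged
# for time-series data that apply to regression tasks.
vm.tests.list_tests(tags=["time_series_data"], task="regression")
```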
Added output (updated test listing with Tags and Tasks columns):

| ID | Name | Description | Required Inputs | Params | Tags | Tasks |
|---|---|---|---|---|---|---|
| validmind.data_validation.ACFandPACFPlot | AC Fand PACF Plot | Analyzes time series data using Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots to... | ['dataset'] | {} | ['time_series_data', 'forecasting', 'statistical_test', 'visualization'] | ['regression'] |
| validmind.data_validation.ADF | ADF | Assesses the stationarity of a time series dataset using the Augmented Dickey-Fuller (ADF) test.... | ['dataset'] | {} | ['time_series_data', 'statsmodels', 'forecasting', 'statistical_test', 'stationarity'] | ['regression'] |
| validmind.data_validation.AutoAR | Auto AR | Automatically identifies the optimal Autoregressive (AR) order for a time series using BIC and AIC criteria.... | ['dataset'] | {'max_ar_order': {'type': 'int', 'default': 3}} | ['time_series_data', 'statsmodels', 'forecasting', 'statistical_test'] | ['regression'] |
| validmind.data_validation.AutoMA | Auto MA | Automatically selects the optimal Moving Average (MA) order for each variable in a time series dataset based on... | ['dataset'] | {'max_ma_order': {'type': 'int', 'default': 3}} | ['time_series_data', 'statsmodels', 'forecasting', 'statistical_test'] | ['regression'] |
| validmind.data_validation.AutoStationarity | Auto Stationarity | Automates Augmented Dickey-Fuller test to assess stationarity across multiple time series in a DataFrame.... | ['dataset'] | {'max_order': {'type': 'int', 'default': 5}, 'threshold': {'type': 'float', 'default': 0.05}} | ['time_series_data', 'statsmodels', 'forecasting', 'statistical_test'] | ['regression'] |
| validmind.data_validation.BivariateScatterPlots | Bivariate Scatter Plots | Generates bivariate scatterplots to visually inspect relationships between pairs of numerical predictor variables... | ['dataset'] | {} | ['tabular_data', 'numerical_data', 'visualization'] | ['classification'] |
| validmind.data_validation.BoxPierce | Box Pierce | Detects autocorrelation in time-series data through the Box-Pierce test to validate model performance.... | ['dataset'] | {} | ['time_series_data', 'forecasting', 'statistical_test', 'statsmodels'] | ['regression'] |
| validmind.data_validation.ChiSquaredFeaturesTable | Chi Squared Features Table | Assesses the statistical association between categorical features and a target variable using the Chi-Squared test.... | ['dataset'] | {'p_threshold': {'type': '_empty', 'default': 0.05}} | ['tabular_data', 'categorical_data', 'statistical_test'] | ['classification'] |
| validmind.data_validation.ClassImbalance | Class Imbalance | Evaluates and quantifies class distribution imbalance in a dataset used by a machine learning model.... | ['dataset'] | {'min_percent_threshold': {'type': 'int', 'default': 10}} | ['tabular_data', 'binary_classification', 'multiclass_classification', 'data_quality'] | ['classification'] |
| validmind.data_validation.DatasetDescription | Dataset Description | Provides comprehensive analysis and statistical summaries of each column in a machine learning model's dataset.... | ['dataset'] | {} | ['tabular_data', 'time_series_data', 'text_data'] | ['classification', 'regression', 'text_classification', 'text_summarization'] |
| validmind.data_validation.DatasetSplit | Dataset Split | Evaluates and visualizes the distribution proportions among training, testing, and validation datasets of an ML... | ['datasets'] | {} | ['tabular_data', 'time_series_data', 'text_data'] | ['classification', 'regression', 'text_classification', 'text_summarization'] |
| validmind.data_validation.DescriptiveStatistics | Descriptive Statistics | Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's... | ['dataset'] | {} | ['tabular_data', 'time_series_data', 'data_quality'] | ['classification', 'regression'] |
| validmind.data_validation.DickeyFullerGLS | Dickey Fuller GLS | Assesses stationarity in time series data using the Dickey-Fuller GLS test to determine the order of integration.... | ['dataset'] | {} | ['time_series_data', 'forecasting', 'unit_root_test'] | ['regression'] |
| validmind.data_validation.Duplicates | Duplicates | Tests dataset for duplicate entries, ensuring model reliability via data quality verification.... | ['dataset'] | {'min_threshold': {'type': '_empty', 'default': 1}} | ['tabular_data', 'data_quality', 'text_data'] | ['classification', 'regression'] |
| validmind.data_validation.EngleGrangerCoint | Engle Granger Coint | Assesses the degree of co-movement between pairs of time series data using the Engle-Granger cointegration test.... | ['dataset'] | {'threshold': {'type': 'float', 'default': 0.05}} | ['time_series_data', 'statistical_test', 'forecasting'] | ['regression'] |
| validmind.data_validation.FeatureTargetCorrelationPlot | Feature Target Correlation Plot | Visualizes the correlation between input features and the model's target output in a color-coded horizontal bar... | ['dataset'] | {'fig_height': {'type': '_empty', 'default': 600}} | ['tabular_data', 'visualization', 'correlation'] | ['classification', 'regression'] |
| validmind.data_validation.HighCardinality | High Cardinality | Assesses the number of unique values in categorical columns to detect high cardinality and potential overfitting.... | ['dataset'] | {'num_threshold': {'type': 'int', 'default': 100}, 'percent_threshold': {'type': 'float', 'default': 0.1}, 'threshold_type': {'type': 'str', 'default': 'percent'}} | ['tabular_data', 'data_quality', 'categorical_data'] | ['classification', 'regression'] |
validmind.data_validation.HighPearsonCorrelationHigh Pearson CorrelationIdentifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity....['dataset']{'max_threshold': {'type': 'float', 'default': 0.3}, 'top_n_correlations': {'type': 'int', 'default': 10}, 'feature_columns': {'type': 'list', 'default': None}}['tabular_data', 'data_quality', 'correlation']['classification', 'regression']
validmind.data_validation.IQROutliersBarPlotIQR Outliers Bar PlotVisualizes outlier distribution across percentiles in numerical data using the Interquartile Range (IQR) method....['dataset']{'threshold': {'type': 'float', 'default': 1.5}, 'fig_width': {'type': 'int', 'default': 800}}['tabular_data', 'visualization', 'numerical_data']['classification', 'regression']
validmind.data_validation.IQROutliersTableIQR Outliers TableDetermines and summarizes outliers in numerical features using the Interquartile Range method....['dataset']{'threshold': {'type': 'float', 'default': 1.5}}['tabular_data', 'numerical_data']['classification', 'regression']
validmind.data_validation.IsolationForestOutliersIsolation Forest OutliersDetects outliers in a dataset using the Isolation Forest algorithm and visualizes results through scatter plots....['dataset']{'random_state': {'type': 'int', 'default': 0}, 'contamination': {'type': 'float', 'default': 0.1}, 'feature_columns': {'type': 'list', 'default': None}}['tabular_data', 'anomaly_detection']['classification']
validmind.data_validation.JarqueBeraJarque BeraAssesses normality of dataset features in an ML model using the Jarque-Bera test....['dataset']{}['tabular_data', 'data_distribution', 'statistical_test', 'statsmodels']['classification', 'regression']
validmind.data_validation.KPSSKPSSAssesses the stationarity of time-series data in a machine learning model using the KPSS unit root test....['dataset']{}['time_series_data', 'stationarity', 'unit_root_test', 'statsmodels']['data_validation']
validmind.data_validation.LJungBoxL Jung BoxAssesses autocorrelations in dataset features by performing a Ljung-Box test on each feature....['dataset']{}['time_series_data', 'forecasting', 'statistical_test', 'statsmodels']['regression']
validmind.data_validation.LaggedCorrelationHeatmapLagged Correlation HeatmapAssesses and visualizes correlation between target variable and lagged independent variables in a time-series...['dataset']{'num_lags': {'type': 'int', 'default': 10}}['time_series_data', 'visualization']['regression']
validmind.data_validation.MissingValuesMissing ValuesEvaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold....['dataset']{'min_threshold': {'type': 'int', 'default': 1}}['tabular_data', 'data_quality']['classification', 'regression']
validmind.data_validation.MissingValuesBarPlotMissing Values Bar PlotAssesses the percentage and distribution of missing values in the dataset via a bar plot, with emphasis on...['dataset']{'threshold': {'type': 'int', 'default': 80}, 'fig_height': {'type': 'int', 'default': 600}}['tabular_data', 'data_quality', 'visualization']['classification', 'regression']
validmind.data_validation.MutualInformationMutual InformationCalculates mutual information scores between features and target variable to evaluate feature relevance....['dataset']{'min_threshold': {'type': 'float', 'default': 0.01}, 'task': {'type': 'str', 'default': 'classification'}}['feature_selection', 'data_analysis']['classification', 'regression']
validmind.data_validation.PearsonCorrelationMatrixPearson Correlation MatrixEvaluates linear dependency between numerical variables in a dataset via a Pearson Correlation coefficient heat map....['dataset']{}['tabular_data', 'numerical_data', 'correlation']['classification', 'regression']
validmind.data_validation.PhillipsPerronArchPhillips Perron ArchAssesses the stationarity of time series data in each feature of the ML model using the Phillips-Perron test....['dataset']{}['time_series_data', 'forecasting', 'statistical_test', 'unit_root_test']['regression']
validmind.data_validation.ProtectedClassesDescriptionProtected Classes DescriptionVisualizes the distribution of protected classes in the dataset relative to the target variable...['dataset']{'protected_classes': {'type': '_empty', 'default': None}}['bias_and_fairness', 'descriptive_statistics']['classification', 'regression']
validmind.data_validation.RollingStatsPlotRolling Stats PlotEvaluates the stationarity of time series data by plotting its rolling mean and standard deviation over a specified...['dataset']{'window_size': {'type': 'int', 'default': 12}}['time_series_data', 'visualization', 'stationarity']['regression']
validmind.data_validation.RunsTestRuns TestExecutes Runs Test on ML model to detect non-random patterns in output data sequence....['dataset']{}['tabular_data', 'statistical_test', 'statsmodels']['classification', 'regression']
validmind.data_validation.ScatterPlotScatter PlotAssesses visual relationships, patterns, and outliers among features in a dataset through scatter plot matrices....['dataset']{}['tabular_data', 'visualization']['classification', 'regression']
validmind.data_validation.ScoreBandDefaultRatesScore Band Default RatesAnalyzes default rates and population distribution across credit score bands....['dataset', 'model']{'score_column': {'type': 'str', 'default': 'score'}, 'score_bands': {'type': 'list', 'default': None}}['visualization', 'credit_risk', 'scorecard']['classification']
validmind.data_validation.SeasonalDecomposeSeasonal DecomposeAssesses patterns and seasonality in a time series dataset by decomposing its features into foundational components....['dataset']{'seasonal_model': {'type': 'str', 'default': 'additive'}}['time_series_data', 'seasonality', 'statsmodels']['regression']
validmind.data_validation.ShapiroWilkShapiro WilkEvaluates feature-wise normality of training data using the Shapiro-Wilk test....['dataset']{}['tabular_data', 'data_distribution', 'statistical_test']['classification', 'regression']
validmind.data_validation.SkewnessSkewnessEvaluates the skewness of numerical data in a dataset to check against a defined threshold, aiming to ensure data...['dataset']{'max_threshold': {'type': '_empty', 'default': 1}}['data_quality', 'tabular_data']['classification', 'regression']
validmind.data_validation.SpreadPlotSpread PlotAssesses potential correlations between pairs of time series variables through visualization to enhance...['dataset']{}['time_series_data', 'visualization']['regression']
validmind.data_validation.TabularCategoricalBarPlotsTabular Categorical Bar PlotsGenerates and visualizes bar plots for each category in categorical features to evaluate the dataset's composition....['dataset']{}['tabular_data', 'visualization']['classification', 'regression']
validmind.data_validation.TabularDateTimeHistogramsTabular Date Time HistogramsGenerates histograms to provide graphical insight into the distribution of time intervals in a model's datetime...['dataset']{}['time_series_data', 'visualization']['classification', 'regression']
validmind.data_validation.TabularDescriptionTablesTabular Description TablesSummarizes key descriptive statistics for numerical, categorical, and datetime variables in a dataset....['dataset']{}['tabular_data']['classification', 'regression']
validmind.data_validation.TabularNumericalHistogramsTabular Numerical HistogramsGenerates histograms for each numerical feature in a dataset to provide visual insights into data distribution and...['dataset']{}['tabular_data', 'visualization']['classification', 'regression']
validmind.data_validation.TargetRateBarPlotsTarget Rate Bar PlotsGenerates bar plots visualizing the default rates of categorical features for a classification machine learning...['dataset']{}['tabular_data', 'visualization', 'categorical_data']['classification']
validmind.data_validation.TimeSeriesDescriptionTime Series DescriptionGenerates a detailed analysis for the provided time series dataset, summarizing key statistics to identify trends,...['dataset']{}['time_series_data', 'analysis']['regression']
validmind.data_validation.TimeSeriesDescriptiveStatisticsTime Series Descriptive StatisticsEvaluates the descriptive statistics of a time series dataset to identify trends, patterns, and data quality issues....['dataset']{}['time_series_data', 'analysis']['regression']
validmind.data_validation.TimeSeriesFrequencyTime Series FrequencyEvaluates consistency of time series data frequency and generates a frequency plot....['dataset']{}['time_series_data']['regression']
validmind.data_validation.TimeSeriesHistogramTime Series HistogramVisualizes distribution of time-series data using histograms and Kernel Density Estimation (KDE) lines....['dataset']{'nbins': {'type': '_empty', 'default': 30}}['data_validation', 'visualization', 'time_series_data']['regression', 'time_series_forecasting']
validmind.data_validation.TimeSeriesLinePlotTime Series Line PlotGenerates and analyses time-series data through line plots revealing trends, patterns, anomalies over time....['dataset']{}['time_series_data', 'visualization']['regression']
validmind.data_validation.TimeSeriesMissingValuesTime Series Missing ValuesValidates time-series data quality by confirming the count of missing values is below a certain threshold....['dataset']{'min_threshold': {'type': 'int', 'default': 1}}['time_series_data']['regression']
validmind.data_validation.TimeSeriesOutliersTime Series OutliersIdentifies and visualizes outliers in time-series data using the z-score method....['dataset']{'zscore_threshold': {'type': 'int', 'default': 3}}['time_series_data']['regression']
validmind.data_validation.TooManyZeroValuesToo Many Zero ValuesIdentifies numerical columns in a dataset that contain an excessive number of zero values, defined by a threshold...['dataset']{'max_percent_threshold': {'type': 'float', 'default': 0.03}}['tabular_data']['regression', 'classification']
validmind.data_validation.UniqueRowsUnique RowsVerifies the diversity of the dataset by ensuring that the count of unique rows exceeds a prescribed threshold....['dataset']{'min_percent_threshold': {'type': 'float', 'default': 1}}['tabular_data']['regression', 'classification']
validmind.data_validation.WOEBinPlotsWOE Bin PlotsGenerates visualizations of Weight of Evidence (WoE) and Information Value (IV) for understanding predictive power...['dataset']{'breaks_adj': {'type': 'list', 'default': None}, 'fig_height': {'type': 'int', 'default': 600}, 'fig_width': {'type': 'int', 'default': 500}}['tabular_data', 'visualization', 'categorical_data']['classification']
validmind.data_validation.WOEBinTableWOE Bin TableAssesses the Weight of Evidence (WoE) and Information Value (IV) of each feature to evaluate its predictive power...['dataset']{'breaks_adj': {'type': 'list', 'default': None}}['tabular_data', 'categorical_data']['classification']
validmind.data_validation.ZivotAndrewsArchZivot Andrews ArchEvaluates the order of integration and stationarity of time series data using the Zivot-Andrews unit root test....['dataset']{}['time_series_data', 'stationarity', 'unit_root_test']['regression']
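Any test in the table above can be run individually. The following is a minimal sketch, assuming a library version that exposes `validmind.tests.run_test` and an already-initialized connection via `vm.init()`; the toy DataFrame and `input_id` values are illustrative only.

```python
import pandas as pd
import validmind as vm
from validmind.tests import run_test

# Toy tabular dataset; replace with your own data.
df = pd.DataFrame({
    "age": [23, 45, 31, 60, 29, 52],
    "income": [40000, 85000, 52000, 120000, 47000, 98000],
    "default": [0, 1, 0, 1, 0, 1],
})
vm_dataset = vm.init_dataset(dataset=df, input_id="demo_dataset", target_column="default")

# Override the default min_percent_threshold (10) listed in the table above.
result = run_test(
    "validmind.data_validation.ClassImbalance",
    inputs={"dataset": vm_dataset},
    params={"min_percent_threshold": 20},
)
result.log()  # optionally send the result to the ValidMind Platform
```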
| Test ID | Name | Description | Required Inputs | Parameters | Tags | Tasks |
|---|---|---|---|---|---|---|
| `validmind.data_validation.nlp.CommonWords` | Common Words | Assesses the most frequent non-stopwords in a text column for identifying prevalent language patterns.... | ['dataset'] | {} | ['nlp', 'text_data', 'visualization', 'frequency_analysis'] | ['text_classification', 'text_summarization'] |
| `validmind.data_validation.nlp.Hashtags` | Hashtags | Assesses hashtag frequency in a text column, highlighting usage trends and potential dataset bias or spam.... | ['dataset'] | {'top_hashtags': {'type': 'int', 'default': 25}} | ['nlp', 'text_data', 'visualization', 'frequency_analysis'] | ['text_classification', 'text_summarization'] |
| `validmind.data_validation.nlp.LanguageDetection` | Language Detection | Assesses the diversity of languages in a textual dataset by detecting and visualizing the distribution of languages.... | ['dataset'] | {} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.data_validation.nlp.Mentions` | Mentions | Calculates and visualizes frequencies of '@' prefixed mentions in a text-based dataset for NLP model analysis.... | ['dataset'] | {'top_mentions': {'type': 'int', 'default': 25}} | ['nlp', 'text_data', 'visualization', 'frequency_analysis'] | ['text_classification', 'text_summarization'] |
| `validmind.data_validation.nlp.PolarityAndSubjectivity` | Polarity And Subjectivity | Analyzes the polarity and subjectivity of text data within a given dataset to visualize the sentiment distribution.... | ['dataset'] | {'threshold_subjectivity': {'type': '_empty', 'default': 0.5}, 'threshold_polarity': {'type': '_empty', 'default': 0}} | ['nlp', 'text_data', 'data_validation'] | ['nlp'] |
| `validmind.data_validation.nlp.Punctuations` | Punctuations | Analyzes and visualizes the frequency distribution of punctuation usage in a given text dataset.... | ['dataset'] | {'count_mode': {'type': '_empty', 'default': 'token'}} | ['nlp', 'text_data', 'visualization', 'frequency_analysis'] | ['text_classification', 'text_summarization', 'nlp'] |
| `validmind.data_validation.nlp.Sentiment` | Sentiment | Analyzes the sentiment of text data within a dataset using the VADER sentiment analysis tool.... | ['dataset'] | {} | ['nlp', 'text_data', 'data_validation'] | ['nlp'] |
| `validmind.data_validation.nlp.StopWords` | Stop Words | Evaluates and visualizes the frequency of English stop words in a text dataset against a defined threshold.... | ['dataset'] | {'min_percent_threshold': {'type': 'float', 'default': 0.5}, 'num_words': {'type': 'int', 'default': 25}} | ['nlp', 'text_data', 'frequency_analysis', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.data_validation.nlp.TextDescription` | Text Description | Conducts comprehensive textual analysis on a dataset using NLTK to evaluate various parameters and generate... | ['dataset'] | {'unwanted_tokens': {'type': 'set', 'default': {"s'", "'s", ' ', 'mr', "''", 'dollar', 'dr', 'mrs', '``', 's', 'us', 'ms'}}, 'lang': {'type': 'str', 'default': 'english'}} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.data_validation.nlp.Toxicity` | Toxicity | Assesses the toxicity of text data within a dataset to visualize the distribution of toxicity scores.... | ['dataset'] | {} | ['nlp', 'text_data', 'data_validation'] | ['nlp'] |
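The `nlp.*` tests operate on a text column rather than numerical features. A minimal sketch, assuming `init_dataset` accepts a `text_column` argument for NLP datasets (verify against your library version):

```python
import pandas as pd
import validmind as vm
from validmind.tests import run_test

df = pd.DataFrame({
    "text": [
        "The quick brown fox jumps over the lazy dog.",
        "I really enjoyed this product, would buy again!",
        "Terrible service. #disappointed @support",
    ]
})
vm_text_ds = vm.init_dataset(dataset=df, input_id="demo_text", text_column="text")

# Flag the dataset if any English stop word exceeds 0.5% of all tokens
# (the defaults listed in the table above, passed explicitly here).
result = run_test(
    "validmind.data_validation.nlp.StopWords",
    inputs={"dataset": vm_text_ds},
    params={"min_percent_threshold": 0.5, "num_words": 25},
)
```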
| Test ID | Name | Description | Required Inputs | Parameters | Tags | Tasks |
|---|---|---|---|---|---|---|
| `validmind.model_validation.BertScore` | Bert Score | Assesses the quality of machine-generated text using BERTScore metrics and visualizes results through histograms... | ['dataset', 'model'] | {'evaluation_model': {'type': '_empty', 'default': 'distilbert-base-uncased'}} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.model_validation.BleuScore` | Bleu Score | Evaluates the quality of machine-generated text using BLEU metrics and visualizes the results through histograms... | ['dataset', 'model'] | {} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.model_validation.ClusterSizeDistribution` | Cluster Size Distribution | Assesses the performance of clustering models by comparing the distribution of cluster sizes in model predictions... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['clustering'] |
| `validmind.model_validation.ContextualRecall` | Contextual Recall | Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct... | ['dataset', 'model'] | {} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.model_validation.FeaturesAUC` | Features AUC | Evaluates the discriminatory power of each individual feature within a binary classification model by calculating... | ['dataset'] | {'fontsize': {'type': 'int', 'default': 12}, 'figure_height': {'type': 'int', 'default': 500}} | ['feature_importance', 'AUC', 'visualization'] | ['classification'] |
| `validmind.model_validation.MeteorScore` | Meteor Score | Assesses the quality of machine-generated translations by comparing them to human-produced references using the... | ['dataset', 'model'] | {} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.model_validation.ModelMetadata` | Model Metadata | Compare metadata of different models and generate a summary table with the results.... | ['model'] | {} | ['model_training', 'metadata'] | ['regression', 'time_series_forecasting'] |
| `validmind.model_validation.ModelPredictionResiduals` | Model Prediction Residuals | Assesses normality and behavior of residuals in regression models through visualization and statistical tests.... | ['dataset', 'model'] | {'nbins': {'type': '_empty', 'default': 100}, 'p_value_threshold': {'type': '_empty', 'default': 0.05}, 'start_date': {'type': '_empty', 'default': None}, 'end_date': {'type': '_empty', 'default': None}} | ['regression'] | ['residual_analysis', 'visualization'] |
| `validmind.model_validation.RegardScore` | Regard Score | Assesses the sentiment and potential biases in text generated by NLP models by computing and visualizing regard... | ['dataset', 'model'] | {} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.model_validation.RegressionResidualsPlot` | Regression Residuals Plot | Evaluates regression model performance using residual distribution and actual vs. predicted plots.... | ['model', 'dataset'] | {'bin_size': {'type': 'float', 'default': 0.1}} | ['model_performance', 'visualization'] | ['regression'] |
| `validmind.model_validation.RougeScore` | Rouge Score | Assesses the quality of machine-generated text using ROUGE metrics and visualizes the results to provide... | ['dataset', 'model'] | {'metric': {'type': '_empty', 'default': 'rouge-1'}} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.model_validation.TimeSeriesPredictionWithCI` | Time Series Prediction With CI | Assesses predictive accuracy and uncertainty in time series models, highlighting breaches beyond confidence... | ['dataset', 'model'] | {'confidence': {'type': '_empty', 'default': 0.95}} | ['model_predictions', 'visualization'] | ['regression', 'time_series_forecasting'] |
| `validmind.model_validation.TimeSeriesPredictionsPlot` | Time Series Predictions Plot | Plot actual vs predicted values for time series data and generate a visual comparison for the model.... | ['dataset', 'model'] | {} | ['model_predictions', 'visualization'] | ['regression', 'time_series_forecasting'] |
| `validmind.model_validation.TimeSeriesR2SquareBySegments` | Time Series R2 Square By Segments | Evaluates the R-Squared values of regression models over specified time segments in time series data to assess... | ['dataset', 'model'] | {'segments': {'type': '_empty', 'default': None}} | ['model_performance', 'sklearn'] | ['regression', 'time_series_forecasting'] |
| `validmind.model_validation.TokenDisparity` | Token Disparity | Evaluates the token disparity between reference and generated texts, visualizing the results through histograms and... | ['dataset', 'model'] | {} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
| `validmind.model_validation.ToxicityScore` | Toxicity Score | Assesses the toxicity levels of texts generated by NLP models to identify and mitigate harmful or offensive content.... | ['dataset', 'model'] | {} | ['nlp', 'text_data', 'visualization'] | ['text_classification', 'text_summarization'] |
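Most `model_validation` tests take both a dataset and a model; `FeaturesAUC` is an exception that needs only a dataset with a binary target, which makes it easy to sketch standalone (same hedged `run_test` assumptions as above):

```python
import pandas as pd
import validmind as vm
from validmind.tests import run_test

df = pd.DataFrame({
    "feature_a": [0.1, 0.9, 0.4, 0.8, 0.2, 0.7],
    "feature_b": [5, 3, 4, 1, 5, 2],
    "target": [0, 1, 0, 1, 0, 1],
})
vm_ds = vm.init_dataset(dataset=df, input_id="demo_binary", target_column="target")

# Per-feature AUC against the binary target; figure options match the
# defaults listed in the table above.
result = run_test(
    "validmind.model_validation.FeaturesAUC",
    inputs={"dataset": vm_ds},
    params={"fontsize": 12, "figure_height": 500},
)
```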
| Test ID | Name | Description | Required Inputs | Parameters | Tags | Tasks |
|---|---|---|---|---|---|---|
| `validmind.model_validation.embeddings.ClusterDistribution` | Cluster Distribution | Assesses the distribution of text embeddings across clusters produced by a model using KMeans clustering.... | ['model', 'dataset'] | {'num_clusters': {'type': 'int', 'default': 5}} | ['llm', 'text_data', 'embeddings', 'visualization'] | ['feature_extraction'] |
| `validmind.model_validation.embeddings.CosineSimilarityComparison` | Cosine Similarity Comparison | Assesses the similarity between embeddings generated by different models using Cosine Similarity, providing both... | ['dataset', 'models'] | {} | ['visualization', 'dimensionality_reduction', 'embeddings'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.embeddings.CosineSimilarityDistribution` | Cosine Similarity Distribution | Assesses the similarity between predicted text embeddings from a model using a Cosine Similarity distribution... | ['dataset', 'model'] | {} | ['llm', 'text_data', 'embeddings', 'visualization'] | ['feature_extraction'] |
| `validmind.model_validation.embeddings.CosineSimilarityHeatmap` | Cosine Similarity Heatmap | Generates an interactive heatmap to visualize the cosine similarities among embeddings derived from a given model.... | ['dataset', 'model'] | {'title': {'type': '_empty', 'default': 'Cosine Similarity Matrix'}, 'color': {'type': '_empty', 'default': 'Cosine Similarity'}, 'xaxis_title': {'type': '_empty', 'default': 'Index'}, 'yaxis_title': {'type': '_empty', 'default': 'Index'}, 'color_scale': {'type': '_empty', 'default': 'Blues'}} | ['visualization', 'dimensionality_reduction', 'embeddings'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.embeddings.DescriptiveAnalytics` | Descriptive Analytics | Evaluates statistical properties of text embeddings in an ML model via mean, median, and standard deviation... | ['dataset', 'model'] | {} | ['llm', 'text_data', 'embeddings', 'visualization'] | ['feature_extraction'] |
| `validmind.model_validation.embeddings.EmbeddingsVisualization2D` | Embeddings Visualization 2D | Visualizes 2D representation of text embeddings generated by a model using t-SNE technique.... | ['model', 'dataset'] | {'cluster_column': {'type': None, 'default': None}, 'perplexity': {'type': 'int', 'default': 30}} | ['llm', 'text_data', 'embeddings', 'visualization'] | ['feature_extraction'] |
| `validmind.model_validation.embeddings.EuclideanDistanceComparison` | Euclidean Distance Comparison | Assesses and visualizes the dissimilarity between model embeddings using Euclidean distance, providing insights... | ['dataset', 'models'] | {} | ['visualization', 'dimensionality_reduction', 'embeddings'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.embeddings.EuclideanDistanceHeatmap` | Euclidean Distance Heatmap | Generates an interactive heatmap to visualize the Euclidean distances among embeddings derived from a given model.... | ['dataset', 'model'] | {'title': {'type': '_empty', 'default': 'Euclidean Distance Matrix'}, 'color': {'type': '_empty', 'default': 'Euclidean Distance'}, 'xaxis_title': {'type': '_empty', 'default': 'Index'}, 'yaxis_title': {'type': '_empty', 'default': 'Index'}, 'color_scale': {'type': '_empty', 'default': 'Blues'}} | ['visualization', 'dimensionality_reduction', 'embeddings'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.embeddings.PCAComponentsPairwisePlots` | PCA Components Pairwise Plots | Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model... | ['dataset', 'model'] | {'n_components': {'type': '_empty', 'default': 3}} | ['visualization', 'dimensionality_reduction', 'embeddings'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.embeddings.StabilityAnalysisKeyword` | Stability Analysis Keyword | Evaluates robustness of embedding models to keyword swaps in the test dataset.... | ['dataset', 'model'] | {'keyword_dict': {'type': None, 'default': None}, 'mean_similarity_threshold': {'type': 'float', 'default': 0.7}} | ['llm', 'text_data', 'embeddings', 'visualization'] | ['feature_extraction'] |
| `validmind.model_validation.embeddings.StabilityAnalysisRandomNoise` | Stability Analysis Random Noise | Assesses the robustness of text embeddings models to random noise introduced via text perturbations.... | ['dataset', 'model'] | {'probability': {'type': 'float', 'default': 0.02}, 'mean_similarity_threshold': {'type': 'float', 'default': 0.7}} | ['llm', 'text_data', 'embeddings', 'visualization'] | ['feature_extraction'] |
| `validmind.model_validation.embeddings.StabilityAnalysisSynonyms` | Stability Analysis Synonyms | Evaluates the stability of text embeddings models when words in test data are replaced by their synonyms randomly.... | ['dataset', 'model'] | {'probability': {'type': 'float', 'default': 0.02}, 'mean_similarity_threshold': {'type': 'float', 'default': 0.7}} | ['llm', 'text_data', 'embeddings', 'visualization'] | ['feature_extraction'] |
| `validmind.model_validation.embeddings.StabilityAnalysisTranslation` | Stability Analysis Translation | Evaluates robustness of text embeddings models to noise introduced by translating the original text to another... | ['dataset', 'model'] | {'source_lang': {'type': 'str', 'default': 'en'}, 'target_lang': {'type': 'str', 'default': 'fr'}, 'mean_similarity_threshold': {'type': 'float', 'default': 0.7}} | ['llm', 'text_data', 'embeddings', 'visualization'] | ['feature_extraction'] |
| `validmind.model_validation.embeddings.TSNEComponentsPairwisePlots` | TSNE Components Pairwise Plots | Creates scatter plots for pairwise combinations of t-SNE components to visualize embeddings and highlight potential... | ['dataset', 'model'] | {'n_components': {'type': '_empty', 'default': 2}, 'perplexity': {'type': '_empty', 'default': 30}, 'title': {'type': '_empty', 'default': 't-SNE'}} | ['visualization', 'dimensionality_reduction', 'embeddings'] | ['text_qa', 'text_generation', 'text_summarization'] |
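The embeddings tests expect a model whose predictions are embedding vectors, so they are harder to demo standalone. The test registry can still be explored programmatically; a sketch, assuming `list_tests` and `describe_test` are exposed in `validmind.tests` as in recent library versions:

```python
from validmind.tests import list_tests, describe_test

# All tests whose ID or metadata mention embeddings.
list_tests(filter="embeddings")

# Full description, required inputs, and parameters for a single test.
describe_test("validmind.model_validation.embeddings.ClusterDistribution")
```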
| Test ID | Name | Description | Required Inputs | Parameters | Tags | Tasks |
|---|---|---|---|---|---|---|
| `validmind.model_validation.ragas.AnswerCorrectness` | Answer Correctness | Evaluates the correctness of answers in a dataset with respect to the provided ground... | ['dataset'] | {'user_input_column': {'type': '_empty', 'default': 'user_input'}, 'response_column': {'type': '_empty', 'default': 'response'}, 'reference_column': {'type': '_empty', 'default': 'reference'}} | ['ragas', 'llm'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.ragas.AspectCritic` | Aspect Critic | Evaluates generations against the following aspects: harmfulness, maliciousness,... | ['dataset'] | {'user_input_column': {'type': '_empty', 'default': 'user_input'}, 'response_column': {'type': '_empty', 'default': 'response'}, 'retrieved_contexts_column': {'type': '_empty', 'default': None}, 'aspects': {'type': 'list', 'default': ['coherence', 'conciseness', 'correctness', 'harmfulness', 'maliciousness']}, 'additional_aspects': {'type': 'list', 'default': None}} | ['ragas', 'llm', 'qualitative'] | ['text_summarization', 'text_generation', 'text_qa'] |
| `validmind.model_validation.ragas.ContextEntityRecall` | Context Entity Recall | Evaluates the context entity recall for dataset entries and visualizes the results.... | ['dataset'] | {'retrieved_contexts_column': {'type': 'str', 'default': 'retrieved_contexts'}, 'reference_column': {'type': 'str', 'default': 'reference'}} | ['ragas', 'llm', 'retrieval_performance'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.ragas.ContextPrecision` | Context Precision | Context Precision is a metric that evaluates whether all of the ground-truth... | ['dataset'] | {'user_input_column': {'type': 'str', 'default': 'user_input'}, 'retrieved_contexts_column': {'type': 'str', 'default': 'retrieved_contexts'}, 'reference_column': {'type': 'str', 'default': 'reference'}} | ['ragas', 'llm', 'retrieval_performance'] | ['text_qa', 'text_generation', 'text_summarization', 'text_classification'] |
| `validmind.model_validation.ragas.ContextPrecisionWithoutReference` | Context Precision Without Reference | Context Precision Without Reference is a metric used to evaluate the relevance of... | ['dataset'] | {'user_input_column': {'type': 'str', 'default': 'user_input'}, 'retrieved_contexts_column': {'type': 'str', 'default': 'retrieved_contexts'}, 'response_column': {'type': 'str', 'default': 'response'}} | ['ragas', 'llm', 'retrieval_performance'] | ['text_qa', 'text_generation', 'text_summarization', 'text_classification'] |
| `validmind.model_validation.ragas.ContextRecall` | Context Recall | Context recall measures the extent to which the retrieved context aligns with the... | ['dataset'] | {'user_input_column': {'type': 'str', 'default': 'user_input'}, 'retrieved_contexts_column': {'type': 'str', 'default': 'retrieved_contexts'}, 'reference_column': {'type': 'str', 'default': 'reference'}} | ['ragas', 'llm', 'retrieval_performance'] | ['text_qa', 'text_generation', 'text_summarization', 'text_classification'] |
| `validmind.model_validation.ragas.Faithfulness` | Faithfulness | Evaluates the faithfulness of the generated answers with respect to retrieved contexts.... | ['dataset'] | {'user_input_column': {'type': '_empty', 'default': 'user_input'}, 'response_column': {'type': '_empty', 'default': 'response'}, 'retrieved_contexts_column': {'type': '_empty', 'default': 'retrieved_contexts'}} | ['ragas', 'llm', 'rag_performance'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.ragas.NoiseSensitivity` | Noise Sensitivity | Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it... | ['dataset'] | {'response_column': {'type': '_empty', 'default': 'response'}, 'retrieved_contexts_column': {'type': '_empty', 'default': 'retrieved_contexts'}, 'reference_column': {'type': '_empty', 'default': 'reference'}, 'focus': {'type': '_empty', 'default': 'relevant'}, 'user_input_column': {'type': '_empty', 'default': 'user_input'}} | ['ragas', 'llm', 'rag_performance'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.ragas.ResponseRelevancy` | Response Relevancy | Assesses how pertinent the generated answer is to the given prompt.... | ['dataset'] | {'user_input_column': {'type': '_empty', 'default': 'user_input'}, 'retrieved_contexts_column': {'type': '_empty', 'default': None}, 'response_column': {'type': '_empty', 'default': 'response'}} | ['ragas', 'llm', 'rag_performance'] | ['text_qa', 'text_generation', 'text_summarization'] |
| `validmind.model_validation.ragas.SemanticSimilarity` | Semantic Similarity | Calculates the semantic similarity between generated responses and ground truths... | ['dataset'] | {'response_column': {'type': '_empty', 'default': 'response'}, 'reference_column': {'type': '_empty', 'default': 'reference'}} | ['ragas', 'llm'] | ['text_qa', 'text_generation', 'text_summarization'] |
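The `ragas.*` tests read named columns from the dataset rather than calling a model input, and the defaults (`user_input`, `response`, `reference`, `retrieved_contexts`) can be remapped through params. They invoke an LLM judge under the hood, so this sketch also assumes an evaluator key (for example `OPENAI_API_KEY`) is configured in the environment:

```python
import pandas as pd
import validmind as vm
from validmind.tests import run_test

df = pd.DataFrame({
    "question": ["What is the capital of France?"],
    "answer": ["Paris is the capital of France."],
    "ground_truth": ["Paris"],
})
vm_rag_ds = vm.init_dataset(dataset=df, input_id="demo_rag")

result = run_test(
    "validmind.model_validation.ragas.AnswerCorrectness",
    inputs={"dataset": vm_rag_ds},
    # Remap the default column names onto this dataset's columns.
    params={
        "user_input_column": "question",
        "response_column": "answer",
        "reference_column": "ground_truth",
    },
)
```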
| Test ID | Name | Description | Required Inputs | Parameters | Tags | Tasks |
|---|---|---|---|---|---|---|
| `validmind.model_validation.sklearn.AdjustedMutualInformation` | Adjusted Mutual Information | Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering'] |
| `validmind.model_validation.sklearn.AdjustedRandIndex` | Adjusted Rand Index | Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering'] |
| `validmind.model_validation.sklearn.CalibrationCurve` | Calibration Curve | Evaluates the calibration of probability estimates by comparing predicted probabilities against observed... | ['model', 'dataset'] | {'n_bins': {'type': 'int', 'default': 10}} | ['sklearn', 'model_performance', 'classification'] | ['classification'] |
| `validmind.model_validation.sklearn.ClassifierPerformance` | Classifier Performance | Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,... | ['dataset', 'model'] | {'average': {'type': 'str', 'default': 'macro'}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.ClassifierThresholdOptimization` | Classifier Threshold Optimization | Analyzes and visualizes different threshold optimization methods for binary classification models.... | ['dataset', 'model'] | {'methods': {'type': None, 'default': None}, 'target_recall': {'type': None, 'default': None}} | ['model_validation', 'threshold_optimization', 'classification_metrics'] | ['classification'] |
| `validmind.model_validation.sklearn.ClusterCosineSimilarity` | Cluster Cosine Similarity | Measures the intra-cluster similarity of a clustering model using cosine similarity.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering'] |
| `validmind.model_validation.sklearn.ClusterPerformanceMetrics` | Cluster Performance Metrics | Evaluates the performance of clustering machine learning models using multiple established metrics.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering'] |
| `validmind.model_validation.sklearn.CompletenessScore` | Completeness Score | Evaluates a clustering model's capacity to categorize instances from a single class into the same cluster.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering'] |
| `validmind.model_validation.sklearn.ConfusionMatrix` | Confusion Matrix | Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix... | ['dataset', 'model'] | {'threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.FeatureImportance` | Feature Importance | Compute feature importance scores for a given model and generate a summary table... | ['dataset', 'model'] | {'num_features': {'type': 'int', 'default': 3}} | ['model_explainability', 'sklearn'] | ['regression', 'time_series_forecasting'] |
| `validmind.model_validation.sklearn.FowlkesMallowsScore` | Fowlkes Mallows Score | Evaluates the similarity between predicted and actual cluster assignments in a model using the Fowlkes-Mallows... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['clustering'] |
| `validmind.model_validation.sklearn.HomogeneityScore` | Homogeneity Score | Assesses clustering homogeneity by comparing true and predicted labels, scoring from 0 (heterogeneous) to 1... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['clustering'] |
| `validmind.model_validation.sklearn.HyperParametersTuning` | Hyper Parameters Tuning | Performs exhaustive grid search over specified parameter ranges to find optimal model configurations... | ['model', 'dataset'] | {'param_grid': {'type': 'dict', 'default': None}, 'scoring': {'type': None, 'default': None}, 'thresholds': {'type': None, 'default': None}, 'fit_params': {'type': 'dict', 'default': None}} | ['sklearn', 'model_performance'] | ['clustering', 'classification'] |
| `validmind.model_validation.sklearn.KMeansClustersOptimization` | K Means Clusters Optimization | Optimizes the number of clusters in K-means models using Elbow and Silhouette methods.... | ['model', 'dataset'] | {'n_clusters': {'type': None, 'default': None}} | ['sklearn', 'model_performance', 'kmeans'] | ['clustering'] |
| `validmind.model_validation.sklearn.MinimumAccuracy` | Minimum Accuracy | Checks if the model's prediction accuracy meets or surpasses a specified threshold.... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.7}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.MinimumF1Score` | Minimum F1 Score | Assesses if the model's F1 score on the validation set meets a predefined minimum threshold, ensuring balanced... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.MinimumROCAUCScore` | Minimum ROC AUC Score | Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.ModelParameters` | Model Parameters | Extracts and displays model parameters in a structured format for transparency and reproducibility.... | ['model'] | {'model_params': {'type': '_empty', 'default': None}} | ['model_training', 'metadata'] | ['classification', 'regression'] |
| `validmind.model_validation.sklearn.ModelsPerformanceComparison` | Models Performance Comparison | Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,... | ['dataset', 'models'] | {} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'model_comparison'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.OverfitDiagnosis` | Overfit Diagnosis | Assesses potential overfitting in a model's predictions, identifying regions where performance between training and... | ['model', 'datasets'] | {'metric': {'type': 'str', 'default': None}, 'cut_off_threshold': {'type': 'float', 'default': 0.04}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'linear_regression', 'model_diagnosis'] | ['classification', 'regression'] |
| `validmind.model_validation.sklearn.PermutationFeatureImportance` | Permutation Feature Importance | Assesses the significance of each feature in a model by evaluating the impact on model performance when feature... | ['model', 'dataset'] | {'fontsize': {'type': None, 'default': None}, 'figure_height': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'feature_importance', 'visualization'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.PopulationStabilityIndex` | Population Stability Index | Assesses the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across... | ['datasets', 'model'] | {'num_bins': {'type': 'int', 'default': 10}, 'mode': {'type': 'str', 'default': 'fixed'}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.PrecisionRecallCurve` | Precision Recall Curve | Evaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve.... | ['model', 'dataset'] | {} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.ROCCurve` | ROC Curve | Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic... | ['model', 'dataset'] | {} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.RegressionErrors` | Regression Errors | Assesses the performance and error distribution of a regression model using various error metrics.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance'] | ['regression', 'classification'] |
| `validmind.model_validation.sklearn.RegressionErrorsComparison` | Regression Errors Comparison | Assesses multiple regression error metrics to compare model performance across different datasets, emphasizing... | ['datasets', 'models'] | {} | ['model_performance', 'sklearn'] | ['regression', 'time_series_forecasting'] |
| `validmind.model_validation.sklearn.RegressionPerformance` | Regression Performance | Evaluates the performance of a regression model using five different metrics: MAE, MSE, RMSE, MAPE, and MBD.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance'] | ['regression'] |
| `validmind.model_validation.sklearn.RegressionR2Square` | Regression R2 Square | Assesses the overall goodness-of-fit of a regression model by evaluating R-squared (R2) and Adjusted R-squared (Adj... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['regression'] |
| `validmind.model_validation.sklearn.RegressionR2SquareComparison` | Regression R2 Square Comparison | Compares R-Squared and Adjusted R-Squared values for different regression models across multiple datasets to assess... | ['datasets', 'models'] | {} | ['model_performance', 'sklearn'] | ['regression', 'time_series_forecasting'] |
| `validmind.model_validation.sklearn.RobustnessDiagnosis` | Robustness Diagnosis | Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.... | ['datasets', 'model'] | {'metric': {'type': 'str', 'default': None}, 'scaling_factor_std_dev_list': {'type': None, 'default': [0.1, 0.2, 0.3, 0.4, 0.5]}, 'performance_decay_threshold': {'type': 'float', 'default': 0.05}} | ['sklearn', 'model_diagnosis', 'visualization'] | ['classification', 'regression'] |
| `validmind.model_validation.sklearn.SHAPGlobalImportance` | SHAP Global Importance | Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.... | ['model', 'dataset'] | {'kernel_explainer_samples': {'type': 'int', 'default': 10}, 'tree_or_linear_explainer_samples': {'type': 'int', 'default': 200}, 'class_of_interest': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'feature_importance', 'visualization'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.ScoreProbabilityAlignment` | Score Probability Alignment | Analyzes the alignment between credit scores and predicted probabilities.... | ['model', 'dataset'] | {'score_column': {'type': 'str', 'default': 'score'}, 'n_bins': {'type': 'int', 'default': 10}} | ['visualization', 'credit_risk', 'calibration'] | ['classification'] |
| `validmind.model_validation.sklearn.SilhouettePlot` | Silhouette Plot | Calculates and visualizes Silhouette Score, assessing the degree of data point suitability to its cluster in ML... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance'] | ['clustering'] |
| `validmind.model_validation.sklearn.TrainingTestDegradation` | Training Test Degradation | Tests if model performance degradation between training and test datasets exceeds a predefined threshold.... | ['datasets', 'model'] | {'max_threshold': {'type': 'float', 'default': 0.1}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification'] |
| `validmind.model_validation.sklearn.VMeasure` | V Measure | Evaluates homogeneity and completeness of a clustering model using the V Measure Score.... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['clustering'] |
| `validmind.model_validation.sklearn.WeakspotsDiagnosis` | Weakspots Diagnosis | Identifies and visualizes weak spots in a machine learning model's performance across various sections of the... | ['datasets', 'model'] | {'features_columns': {'type': None, 'default': None}, 'metrics': {'type': None, 'default': None}, 'thresholds': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_diagnosis', 'visualization'] | ['classification', 'text_classification'] |
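The sklearn suite expects a model wrapped with `init_model` and a dataset that carries that model's predictions. A minimal end-to-end sketch, assuming `assign_predictions` is available on the dataset object (as in recent library versions):

```python
import pandas as pd
import validmind as vm
from sklearn.linear_model import LogisticRegression
from validmind.tests import run_test

df = pd.DataFrame({
    "x1": [0.1, 0.9, 0.4, 0.8, 0.2, 0.7, 0.3, 0.6],
    "x2": [5, 3, 4, 1, 5, 2, 4, 2],
    "y":  [0, 1, 0, 1, 0, 1, 0, 1],
})
model = LogisticRegression().fit(df[["x1", "x2"]], df["y"])

vm_model = vm.init_model(model, input_id="demo_logreg")
vm_ds = vm.init_dataset(dataset=df, input_id="demo_train", target_column="y")
vm_ds.assign_predictions(model=vm_model)  # attach predictions for the test to use

result = run_test(
    "validmind.model_validation.sklearn.ConfusionMatrix",
    inputs={"dataset": vm_ds, "model": vm_model},
    params={"threshold": 0.5},  # default decision threshold from the table above
)
```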
| Test ID | Name | Description | Required Inputs | Parameters | Tags | Tasks |
|---|---|---|---|---|---|---|
| `validmind.model_validation.statsmodels.AutoARIMA` | Auto ARIMA | Evaluates ARIMA models for time-series forecasting, ranking them using Bayesian and Akaike Information Criteria.... | ['model', 'dataset'] | {} | ['time_series_data', 'forecasting', 'model_selection', 'statsmodels'] | ['regression'] |
| `validmind.model_validation.statsmodels.CumulativePredictionProbabilities` | Cumulative Prediction Probabilities | Visualizes cumulative probabilities of positive and negative classes for both training and testing in classification models.... | ['dataset', 'model'] | {'title': {'type': '_empty', 'default': 'Cumulative Probabilities'}} | ['visualization', 'credit_risk'] | ['classification'] |
| `validmind.model_validation.statsmodels.DurbinWatsonTest` | Durbin Watson Test | Assesses autocorrelation in time series data features using the Durbin-Watson statistic.... | ['dataset', 'model'] | {'threshold': {'type': '_empty', 'default': [1.5, 2.5]}} | ['time_series_data', 'forecasting', 'statistical_test', 'statsmodels'] | ['regression'] |
| `validmind.model_validation.statsmodels.GINITable` | GINI Table | Evaluates classification model performance using AUC, GINI, and KS metrics for training and test datasets.... | ['dataset', 'model'] | {} | ['model_performance'] | ['classification'] |
| `validmind.model_validation.statsmodels.KolmogorovSmirnov` | Kolmogorov Smirnov | Assesses whether each feature in the dataset aligns with a normal distribution using the Kolmogorov-Smirnov test.... | ['model', 'dataset'] | {'dist': {'type': 'str', 'default': 'norm'}} | ['tabular_data', 'data_distribution', 'statistical_test', 'statsmodels'] | ['classification', 'regression'] |
| `validmind.model_validation.statsmodels.Lilliefors` | Lilliefors | Assesses the normality of feature distributions in an ML model's training dataset using the Lilliefors test.... | ['dataset'] | {} | ['tabular_data', 'data_distribution', 'statistical_test', 'statsmodels'] | ['classification', 'regression'] |
| `validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram` | Prediction Probabilities Histogram | Assesses the predictive probability distribution for binary classification to evaluate model performance and... | ['dataset', 'model'] | {'title': {'type': '_empty', 'default': 'Histogram of Predictive Probabilities'}} | ['visualization', 'credit_risk'] | ['classification'] |
| `validmind.model_validation.statsmodels.RegressionCoeffs` | Regression Coeffs | Assesses the significance and uncertainty of predictor variables in a regression model through visualization of... | ['model'] | {} | ['tabular_data', 'visualization', 'model_training'] | ['regression'] |
| `validmind.model_validation.statsmodels.RegressionFeatureSignificance` | Regression Feature Significance | Assesses and visualizes the statistical significance of features in a regression model.... | ['model'] | {'fontsize': {'type': 'int', 'default': 10}, 'p_threshold': {'type': 'float', 'default': 0.05}} | ['statistical_test', 'model_interpretation', 'visualization', 'feature_importance'] | ['regression'] |
| `validmind.model_validation.statsmodels.RegressionModelForecastPlot` | Regression Model Forecast Plot | Generates plots to visually compare the forecasted outcomes of a regression model against actual observed values over... | ['model', 'dataset'] | {'start_date': {'type': None, 'default': None}, 'end_date': {'type': None, 'default': None}} | ['time_series_data', 'forecasting', 'visualization'] | ['regression'] |
| `validmind.model_validation.statsmodels.RegressionModelForecastPlotLevels` | Regression Model Forecast Plot Levels | Assesses the alignment between forecasted and observed values in regression models through visual plots... | ['model', 'dataset'] | {} | ['time_series_data', 'forecasting', 'visualization'] | ['regression'] |
| `validmind.model_validation.statsmodels.RegressionModelSensitivityPlot` | Regression Model Sensitivity Plot | Assesses the sensitivity of a regression model to changes in independent variables by applying shocks and... | ['dataset', 'model'] | {'shocks': {'type': None, 'default': [0.1]}, 'transformation': {'type': None, 'default': None}} | ['senstivity_analysis', 'visualization'] | ['regression'] |
| `validmind.model_validation.statsmodels.RegressionModelSummary` | Regression Model Summary | Evaluates regression model performance using metrics including R-Squared, Adjusted R-Squared, MSE, and RMSE.... | ['dataset', 'model'] | {} | ['model_performance', 'regression'] | ['regression'] |
| `validmind.model_validation.statsmodels.RegressionPermutationFeatureImportance` | Regression Permutation Feature Importance | Assesses the significance of each feature in a model by evaluating the impact on model performance when feature... | ['dataset', 'model'] | {'fontsize': {'type': 'int', 'default': 12}, 'figure_height': {'type': 'int', 'default': 500}} | ['statsmodels', 'feature_importance', 'visualization'] | ['regression'] |
| `validmind.model_validation.statsmodels.ScorecardHistogram` | Scorecard Histogram | The Scorecard Histogram test evaluates the distribution of credit scores between default and non-default instances,... | ['dataset'] | {'title': {'type': '_empty', 'default': 'Histogram of Scores'}, 'score_column': {'type': '_empty', 'default': 'score'}} | ['visualization', 'credit_risk', 'logistic_regression'] | ['classification'] |
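The same pattern applies to the statsmodels suite. A sketch with a formula-based OLS model, assuming `init_model` accepts a fitted statsmodels results object (formula models predict directly on the raw DataFrame, which keeps the example short):

```python
import pandas as pd
import statsmodels.formula.api as smf
import validmind as vm
from validmind.tests import run_test

df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1.1, 1.9, 3.2, 3.8, 5.1, 6.2]})
ols = smf.ols("y ~ x", data=df).fit()

vm_model = vm.init_model(ols, input_id="demo_ols")
vm_ds = vm.init_dataset(dataset=df, input_id="demo_reg", target_column="y", feature_columns=["x"])
vm_ds.assign_predictions(model=vm_model)

result = run_test(
    "validmind.model_validation.statsmodels.RegressionModelSummary",
    inputs={"dataset": vm_ds, "model": vm_model},
)
```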
| Test ID | Name | Description | Required Inputs | Parameters | Tags | Tasks |
|---|---|---|---|---|---|---|
| `validmind.ongoing_monitoring.CalibrationCurveDrift` | Calibration Curve Drift | Evaluates changes in probability calibration between reference and monitoring datasets.... | ['datasets', 'model'] | {'n_bins': {'type': 'int', 'default': 10}, 'drift_pct_threshold': {'type': 'float', 'default': 20}} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification'] |
| `validmind.ongoing_monitoring.ClassDiscriminationDrift` | Class Discrimination Drift | Compares classification discrimination metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification'] |
| `validmind.ongoing_monitoring.ClassImbalanceDrift` | Class Imbalance Drift | Evaluates drift in class distribution between reference and monitoring datasets.... | ['datasets'] | {'drift_pct_threshold': {'type': 'float', 'default': 5.0}, 'title': {'type': 'str', 'default': 'Class Distribution Drift'}} | ['tabular_data', 'binary_classification', 'multiclass_classification'] | ['classification'] |
| `validmind.ongoing_monitoring.ClassificationAccuracyDrift` | Classification Accuracy Drift | Compares classification accuracy metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification'] |
| `validmind.ongoing_monitoring.ConfusionMatrixDrift` | Confusion Matrix Drift | Compares confusion matrix metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification'] |
| `validmind.ongoing_monitoring.CumulativePredictionProbabilitiesDrift` | Cumulative Prediction Probabilities Drift | Compares cumulative prediction probability distributions between reference and monitoring datasets.... | ['datasets', 'model'] | {} | ['visualization', 'credit_risk'] | ['classification'] |
| `validmind.ongoing_monitoring.FeatureDrift` | Feature Drift | Evaluates changes in feature distribution over time to identify potential model drift.... | ['datasets'] | {'bins': {'type': '_empty', 'default': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}, 'feature_columns': {'type': '_empty', 'default': None}, 'psi_threshold': {'type': '_empty', 'default': 0.2}} | ['visualization'] | ['monitoring'] |
| `validmind.ongoing_monitoring.PredictionAcrossEachFeature` | Prediction Across Each Feature | Assesses differences in model predictions across individual features between reference and monitoring datasets... | ['datasets', 'model'] | {} | ['visualization'] | ['monitoring'] |
| `validmind.ongoing_monitoring.PredictionCorrelation` | Prediction Correlation | Assesses correlation changes between model predictions from reference and monitoring datasets to detect potential... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['visualization'] | ['monitoring'] |
| `validmind.ongoing_monitoring.PredictionProbabilitiesHistogramDrift` | Prediction Probabilities Histogram Drift | Compares prediction probability distributions between reference and monitoring datasets.... | ['datasets', 'model'] | {'title': {'type': '_empty', 'default': 'Prediction Probabilities Histogram Drift'}, 'drift_pct_threshold': {'type': 'float', 'default': 20.0}} | ['visualization', 'credit_risk'] | ['classification'] |
| `validmind.ongoing_monitoring.PredictionQuantilesAcrossFeatures` | Prediction Quantiles Across Features | Assesses differences in model prediction distributions across individual features between reference... | ['datasets', 'model'] | {} | ['visualization'] | ['monitoring'] |
| `validmind.ongoing_monitoring.ROCCurveDrift` | ROC Curve Drift | Compares ROC curves between reference and monitoring datasets.... | ['datasets', 'model'] | {} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification'] |
| `validmind.ongoing_monitoring.ScoreBandsDrift` | Score Bands Drift | Analyzes drift in population distribution and default rates across score bands.... | ['datasets', 'model'] | {'score_column': {'type': 'str', 'default': 'score'}, 'score_bands': {'type': 'list', 'default': None}, 'drift_threshold': {'type': 'float', 'default': 20.0}} | ['visualization', 'credit_risk', 'scorecard'] | ['classification'] |
| `validmind.ongoing_monitoring.ScorecardHistogramDrift` | Scorecard Histogram Drift | Compares score distributions between reference and monitoring datasets for each class.... | ['datasets'] | {'score_column': {'type': 'str', 'default': 'score'}, 'title': {'type': 'str', 'default': 'Scorecard Histogram Drift'}, 'drift_pct_threshold': {'type': 'float', 'default': 20.0}} | ['visualization', 'credit_risk', 'logistic_regression'] | ['classification'] |
| `validmind.ongoing_monitoring.TargetPredictionDistributionPlot` | Target Prediction Distribution Plot | Assesses differences in prediction distributions between a reference dataset and a monitoring dataset to identify... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['visualization'] | ['monitoring'] |
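Monitoring tests compare a reference dataset against a monitoring dataset, passed as a list under the `datasets` input. A sketch; the ordering (reference first, monitoring second) is an assumption to verify against your library version:

```python
import numpy as np
import pandas as pd
import validmind as vm
from validmind.tests import run_test

rng = np.random.default_rng(0)
reference = pd.DataFrame({"score_input": rng.normal(0.0, 1.0, 500)})
monitoring = pd.DataFrame({"score_input": rng.normal(0.4, 1.0, 500)})  # shifted

vm_ref = vm.init_dataset(dataset=reference, input_id="reference_ds")
vm_mon = vm.init_dataset(dataset=monitoring, input_id="monitoring_ds")

# A PSI above 0.2 flags drift for a feature (the psi_threshold default above).
result = run_test(
    "validmind.ongoing_monitoring.FeatureDrift",
    inputs={"datasets": [vm_ref, vm_mon]},
    params={"psi_threshold": 0.2},
)
```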
| Test ID | Name | Description | Required Inputs | Parameters | Tags | Tasks |
|---|---|---|---|---|---|---|
| `validmind.prompt_validation.Bias` | Bias | Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the... | ['model'] | {'min_threshold': {'type': '_empty', 'default': 7}} | ['llm', 'few_shot'] | ['text_classification', 'text_summarization'] |
| `validmind.prompt_validation.Clarity` | Clarity | Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.... | ['model'] | {'min_threshold': {'type': '_empty', 'default': 7}} | ['llm', 'zero_shot', 'few_shot'] | ['text_classification', 'text_summarization'] |
| `validmind.prompt_validation.Conciseness` | Conciseness | Analyzes and grades the conciseness of prompts provided to a Large Language Model.... | ['model'] | {'min_threshold': {'type': '_empty', 'default': 7}} | ['llm', 'zero_shot', 'few_shot'] | ['text_classification', 'text_summarization'] |
| `validmind.prompt_validation.Delimitation` | Delimitation | Evaluates the proper use of delimiters in prompts provided to Large Language Models.... | ['model'] | {'min_threshold': {'type': '_empty', 'default': 7}} | ['llm', 'zero_shot', 'few_shot'] | ['text_classification', 'text_summarization'] |
| `validmind.prompt_validation.NegativeInstruction` | Negative Instruction | Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.... | ['model'] | {'min_threshold': {'type': '_empty', 'default': 7}} | ['llm', 'zero_shot', 'few_shot'] | ['text_classification', 'text_summarization'] |
| `validmind.prompt_validation.Robustness` | Robustness | Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts. This test... | ['model', 'dataset'] | {'num_tests': {'type': '_empty', 'default': 10}} | ['llm', 'zero_shot', 'few_shot'] | ['text_classification', 'text_summarization'] |
| `validmind.prompt_validation.Specificity` | Specificity | Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail,... | ['model'] | {'min_threshold': {'type': '_empty', 'default': 7}} | ['llm', 'zero_shot', 'few_shot'] | ['text_classification', 'text_summarization'] |
validmind.unit_metrics.classification.AccuracyAccuracyCalculates the accuracy of a model['dataset', 'model']{}['classification']['classification']
validmind.unit_metrics.classification.F1F1Calculates the F1 score for a classification model.['model', 'dataset']{}['classification']['classification']
validmind.unit_metrics.classification.PrecisionPrecisionCalculates the precision for a classification model.['model', 'dataset']{}['classification']['classification']
validmind.unit_metrics.classification.ROC_AUCROC AUCCalculates the ROC AUC for a classification model.['model', 'dataset']{}['classification']['classification']
validmind.unit_metrics.classification.RecallRecallCalculates the recall for a classification model.['model', 'dataset']{}['classification']['classification']
validmind.unit_metrics.regression.AdjustedRSquaredScoreAdjusted R Squared ScoreCalculates the adjusted R-squared score for a regression model.['model', 'dataset']{}['regression']['regression']
validmind.unit_metrics.regression.GiniCoefficientGini CoefficientCalculates the Gini coefficient for a regression model.['dataset', 'model']{}['regression']['regression']
validmind.unit_metrics.regression.HuberLossHuber LossCalculates the Huber loss for a regression model.['model', 'dataset']{}['regression']['regression']
validmind.unit_metrics.regression.KolmogorovSmirnovStatisticKolmogorov Smirnov StatisticCalculates the Kolmogorov-Smirnov statistic for a regression model.['dataset', 'model']{}['regression']['regression']
validmind.unit_metrics.regression.MeanAbsoluteErrorMean Absolute ErrorCalculates the mean absolute error for a regression model.['model', 'dataset']{}['regression']['regression']
validmind.unit_metrics.regression.MeanAbsolutePercentageErrorMean Absolute Percentage ErrorCalculates the mean absolute percentage error for a regression model.['model', 'dataset']{}['regression']['regression']
validmind.unit_metrics.regression.MeanBiasDeviationMean Bias DeviationCalculates the mean bias deviation for a regression model.['model', 'dataset']{}['regression']['regression']
validmind.unit_metrics.regression.MeanSquaredErrorMean Squared ErrorCalculates the mean squared error for a regression model.['model', 'dataset']{}['regression']['regression']
validmind.unit_metrics.regression.QuantileLossQuantile LossCalculates the quantile loss for a regression model.['model', 'dataset']{'quantile': {'type': '_empty', 'default': 0.5}}['regression']['regression']
validmind.unit_metrics.regression.RSquaredScoreR Squared ScoreCalculates the R-squared score for a regression model.['model', 'dataset']{}['regression']['regression']
validmind.unit_metrics.regression.RootMeanSquaredErrorRoot Mean Squared ErrorCalculates the root mean squared error for a regression model.['model', 'dataset']{}['regression']['regression']
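A registry listing like the one above is what the ValidMind Library's test-discovery helper returns. A minimal sketch, assuming only that the `validmind` package is installed (browsing the registry does not require platform credentials):

import validmind as vm

# Full registry: one row per test with ID, Name, Description,
# Required Inputs, Params, Tags, and Tasks.
vm.tests.list_tests()

# Substring filter on the listing, e.g. only the ongoing-monitoring
# drift tests shown above.
vm.tests.list_tests(filter="ongoing_monitoring")

Passing a narrower string such as "unit_metrics" should scope the listing to just the unit metrics at the end of the table.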
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -1317,18 +1912,20 @@ { "data": { "text/plain": [ - "['text_qa',\n", - " 'time_series_forecasting',\n", + "['time_series_forecasting',\n", + " 'feature_extraction',\n", + " 'text_qa',\n", " 'text_generation',\n", - " 'text_summarization',\n", - " 'nlp',\n", - " 'text_classification',\n", + " 'residual_analysis',\n", " 'visualization',\n", - " 'classification',\n", - " 'feature_extraction',\n", + " 'text_classification',\n", " 'regression',\n", - " 'residual_analysis',\n", - " 'clustering']" + " 'nlp',\n", + " 'text_summarization',\n", + " 'data_validation',\n", + " 'classification',\n", + " 'clustering',\n", + " 'monitoring']" ] }, "execution_count": 3, @@ -1348,57 +1945,66 @@ { "data": { "text/plain": [ - "['statsmodels',\n", - " 'anomaly_detection',\n", - " 'text_data',\n", - " 'data_quality',\n", + "['few_shot',\n", " 'ragas',\n", - " 'kmeans',\n", - " 'stationarity',\n", - " 'seasonality',\n", - " 'model_metadata',\n", - " 'zero_shot',\n", - " 'embeddings',\n", - " 'tabular_data',\n", - " 'qualitative',\n", - " 'forecasting',\n", - " 'correlation',\n", - " 'model_interpretation',\n", - " 'model_comparison',\n", - " 'feature_importance',\n", + " 'bias_and_fairness',\n", " 'AUC',\n", - " 'analysis',\n", - " 'time_series_data',\n", + " 'visualization',\n", " 'rag_performance',\n", - " 'text_embeddings',\n", + " 'logistic_regression',\n", + " 'model_validation',\n", + " 'credit_risk',\n", + " 'model_selection',\n", + " 'linear_regression',\n", + " 'clustering',\n", + " 'data_distribution',\n", " 'model_explainability',\n", - " 'data_validation',\n", + " 'frequency_analysis',\n", + " 'model_interpretation',\n", + " 'time_series_data',\n", + " 'forecasting',\n", + " 'llm',\n", " 'multiclass_classification',\n", + " 'data_validation',\n", " 'binary_classification',\n", - " 'nlp',\n", - " 'data_distribution',\n", - " 'sklearn',\n", - " 'visualization',\n", - " 'few_shot',\n", - " 'numerical_data',\n", - " 'model_predictions',\n", - " 'frequency_analysis',\n", - " 'model_performance',\n", + " 'stationarity',\n", " 'senstivity_analysis',\n", - " 'logistic_regression',\n", - " 'unit_root_test',\n", - " 'model_selection',\n", + " 'retrieval_performance',\n", + " 'categorical_data',\n", + " 'seasonality',\n", + " 'qualitative',\n", + " 'model_comparison',\n", + " 'model_training',\n", + " 'data_quality',\n", + " 'regression',\n", + " 'anomaly_detection',\n", + " 'calibration',\n", + " 'model_predictions',\n", " 'dimensionality_reduction',\n", + " 'descriptive_statistics',\n", + " 'classification',\n", + " 'unit_root_test',\n", " 'metadata',\n", - " 'llm',\n", - " 'statistical_test',\n", - " 'retrieval_performance',\n", - " 'model_training',\n", + " 'threshold_optimization',\n", " 'model_diagnosis',\n", - " 'categorical_data',\n", - " 'regression',\n", - " 'risk_analysis',\n", - " 'credit_risk']" + " 'feature_selection',\n", + " 'data_analysis',\n", + " 'statistical_test',\n", + " 'embeddings',\n", + " 'analysis',\n", + " 'feature_importance',\n", + " 'scorecard',\n", + " 'correlation',\n", + " 'classification_metrics',\n", + " 'nlp',\n", + " 'sklearn',\n", + " 'kmeans',\n", + " 'statsmodels',\n", + " 'numerical_data',\n", + " 'zero_shot',\n", + " 'text_data',\n", + " 'tabular_data',\n", + " 'model_performance']" ] }, "execution_count": 4, @@ -1426,74 +2032,82 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Task | Tags
- text_classification | text_data, ragas, model_metadata, zero_shot, tabular_data, model_comparison, feature_importance, time_series_data, multiclass_classification, binary_classification, nlp, sklearn, visualization, few_shot, frequency_analysis, model_performance, llm, retrieval_performance, model_diagnosis
+ regression | bias_and_fairness, visualization, model_selection, linear_regression, data_distribution, model_explainability, model_interpretation, time_series_data, forecasting, multiclass_classification, data_validation, binary_classification, stationarity, model_performance, senstivity_analysis, categorical_data, seasonality, data_quality, regression, model_predictions, descriptive_statistics, unit_root_test, metadata, model_diagnosis, feature_selection, data_analysis, statistical_test, analysis, feature_importance, correlation, sklearn, statsmodels, numerical_data, text_data, tabular_data, model_training
+ classification | bias_and_fairness, AUC, visualization, logistic_regression, model_validation, credit_risk, linear_regression, data_distribution, time_series_data, multiclass_classification, binary_classification, categorical_data, model_comparison, model_training, data_quality, anomaly_detection, calibration, descriptive_statistics, classification, metadata, model_diagnosis, threshold_optimization, feature_selection, data_analysis, statistical_test, classification_metrics, feature_importance, scorecard, correlation, sklearn, statsmodels, numerical_data, text_data, tabular_data, model_performance
- text_summarization | time_series_data, rag_performance, dimensionality_reduction, text_data, qualitative, ragas, nlp, llm, model_metadata, visualization, few_shot, retrieval_performance, zero_shot, frequency_analysis, embeddings, tabular_data
+ text_classification | few_shot, ragas, visualization, frequency_analysis, model_comparison, feature_importance, time_series_data, nlp, llm, sklearn, multiclass_classification, zero_shot, text_data, binary_classification, retrieval_performance, tabular_data, model_performance, model_diagnosis
- residual_analysis | regression
+ text_summarization | few_shot, ragas, qualitative, visualization, frequency_analysis, embeddings, rag_performance, time_series_data, nlp, llm, zero_shot, text_data, dimensionality_reduction, retrieval_performance, tabular_data
- visualization | regression
+ data_validation | stationarity, time_series_data, statsmodels, unit_root_test
- regression | statsmodels, text_data, data_quality, stationarity, seasonality, model_metadata, tabular_data, forecasting, correlation, model_interpretation, model_comparison, feature_importance, analysis, time_series_data, model_explainability, data_validation, data_distribution, sklearn, visualization, numerical_data, model_predictions, model_performance, senstivity_analysis, unit_root_test, model_selection, metadata, statistical_test, model_training, categorical_data, risk_analysis
+ time_series_forecasting | model_explainability, visualization, time_series_data, sklearn, model_predictions, data_validation, model_performance, model_training, metadata
- time_series_forecasting | model_explainability, metadata, data_validation, sklearn, visualization, model_training, model_predictions, model_performance
+ nlp | visualization, frequency_analysis, data_validation, nlp, text_data
- classification | statsmodels, anomaly_detection, text_data, data_quality, model_metadata, tabular_data, correlation, model_comparison, feature_importance, AUC, time_series_data, multiclass_classification, binary_classification, data_distribution, sklearn, visualization, numerical_data, model_performance, logistic_regression, statistical_test, model_diagnosis, categorical_data, risk_analysis, credit_risk
+ clustering | sklearn, kmeans, clustering, model_performance
- clustering | sklearn, model_performance, kmeans
+ residual_analysis | regression
- text_qa | rag_performance, dimensionality_reduction, qualitative, ragas, llm, visualization, retrieval_performance, embeddings
+ visualization | regression
- text_generation | rag_performance, dimensionality_reduction, qualitative, ragas, llm, visualization, retrieval_performance, embeddings
+ feature_extraction | text_data, llm, visualization, embeddings
- feature_extraction | llm, text_embeddings, visualization, text_data
+ text_qa | ragas, qualitative, visualization, embeddings, rag_performance, llm, dimensionality_reduction, retrieval_performance
- nlp | data_validation, nlp, text_data
+ text_generation | ragas, qualitative, visualization, embeddings, rag_performance, llm, dimensionality_reduction, retrieval_performance
+ monitoring | visualization
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -1532,274 +2146,418 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
- ID | Name | Description | Required Inputs | Params
+ ID | Name | Description | Required Inputs | Params | Tags | Tasks
- validmind.model_validation.ClusterSizeDistribution | Cluster Size Distribution | Compares and visualizes the distribution of cluster sizes in model predictions and actual data for assessing... | ['model', 'dataset'] | None
- validmind.model_validation.TimeSeriesR2SquareBySegments | Time Series R2 Square By Segments | Plot R-Squared values for each model over specified time segments and generate a bar chart... | ['datasets', 'models'] | {'segments': None}
- validmind.model_validation.sklearn.RegressionModelsPerformanceComparison | Regression Models Performance Comparison | Compares and evaluates the performance of multiple regression models using five different metrics: MAE, MSE, RMSE,... | ['dataset', 'models'] | None
- validmind.model_validation.sklearn.AdjustedMutualInformation | Adjusted Mutual Information | Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.SilhouettePlot | Silhouette Plot | Calculates and visualizes Silhouette Score, assessing degree of data point suitability to its cluster in ML models.... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.RobustnessDiagnosis | Robustness Diagnosis | Evaluates the robustness of a machine learning model by injecting Gaussian noise to input data and measuring... | ['model', 'datasets'] | {'features_columns': None, 'scaling_factor_std_dev_list': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], 'accuracy_decay_threshold': 4}
- validmind.model_validation.sklearn.AdjustedRandIndex | Adjusted Rand Index | Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.SHAPGlobalImportance | SHAP Global Importance | Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.... | ['model', 'dataset'] | {'kernel_explainer_samples': 10, 'tree_or_linear_explainer_samples': 200}
- validmind.model_validation.sklearn.ConfusionMatrix | Confusion Matrix | Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.HomogeneityScore | Homogeneity Score | Assesses clustering homogeneity by comparing true and predicted labels, scoring from 0 (heterogeneous) to 1... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.CompletenessScore | Completeness Score | Evaluates a clustering model's capacity to categorize instances from a single class into the same cluster.... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.OverfitDiagnosis | Overfit Diagnosis | Detects and visualizes overfit regions in an ML model by comparing performance on training and test datasets.... | ['model', 'datasets'] | {'features_columns': None, 'cut_off_percentage': 4}
- validmind.model_validation.sklearn.ClusterPerformanceMetrics | Cluster Performance Metrics | Evaluates the performance of clustering machine learning models using multiple established metrics.... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.PermutationFeatureImportance | Permutation Feature Importance | Assesses the significance of each feature in a model by evaluating the impact on model performance when feature... | ['model', 'dataset'] | {'fontsize': None, 'figure_height': 1000}
- validmind.model_validation.sklearn.FowlkesMallowsScore | Fowlkes Mallows Score | Evaluates the similarity between predicted and actual cluster assignments in a model using the Fowlkes-Mallows... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.MinimumROCAUCScore | Minimum ROCAUC Score | Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.... | ['model', 'dataset'] | {'min_threshold': 0.5}
- validmind.model_validation.sklearn.ClusterCosineSimilarity | Cluster Cosine Similarity | Measures the intra-cluster similarity of a clustering model using cosine similarity.... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.PrecisionRecallCurve | Precision Recall Curve | Evaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve.... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.ClassifierPerformance | Classifier Performance | Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.VMeasure | V Measure | Evaluates homogeneity and completeness of a clustering model using the V Measure Score.... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.MinimumF1Score | Minimum F1 Score | Evaluates if the model's F1 score on the validation set meets a predefined minimum threshold.... | ['model', 'dataset'] | {'min_threshold': 0.5}
- validmind.model_validation.sklearn.ROCCurve | ROC Curve | Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.RegressionR2Square | Regression R2 Square | **Purpose**: The purpose of the RegressionR2Square Metric test is to measure the overall goodness-of-fit of a... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.RegressionErrors | Regression Errors | **Purpose**: This metric is used to measure the performance of a regression model. It gauges the model's accuracy... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.ClusterPerformance | Cluster Performance | Evaluates and compares a clustering model's performance on training and testing datasets using multiple defined... | ['model', 'datasets'] | None
- validmind.model_validation.sklearn.FeatureImportanceComparison | Feature Importance Comparison | Compare feature importance scores for each model and generate a summary table... | ['datasets', 'models'] | {'num_features': 3}
- validmind.model_validation.sklearn.TrainingTestDegradation | Training Test Degradation | Tests if model performance degradation between training and test datasets exceeds a predefined threshold.... | ['model', 'datasets'] | {'metrics': ['accuracy', 'precision', 'recall', 'f1'], 'max_threshold': 0.1}
- validmind.model_validation.sklearn.RegressionErrorsComparison | Regression Errors Comparison | Compare regression error metrics for each model and generate a summary table... | ['datasets', 'models'] | {}
- validmind.model_validation.sklearn.HyperParametersTuning | Hyper Parameters Tuning | Exerts exhaustive grid search to identify optimal hyperparameters for the model, improving performance.... | ['model', 'dataset'] | {'param_grid': None, 'scoring': None}
- validmind.model_validation.sklearn.KMeansClustersOptimization | K Means Clusters Optimization | Optimizes the number of clusters in K-means models using Elbow and Silhouette methods.... | ['model', 'dataset'] | {'n_clusters': None}
- validmind.model_validation.sklearn.ModelsPerformanceComparison | Models Performance Comparison | Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,... | ['dataset', 'models'] | None
- validmind.model_validation.sklearn.WeakspotsDiagnosis | Weakspots Diagnosis | Identifies and visualizes weak spots in a machine learning model's performance across various sections of the... | ['model', 'datasets'] | {'features_columns': None, 'thresholds': {'accuracy': 0.75, 'precision': 0.5, 'recall': 0.5, 'f1': 0.7}}
- validmind.model_validation.sklearn.RegressionR2SquareComparison | Regression R2 Square Comparison | Compare R-Squared and Adjusted R-Squared values for each model and generate a summary table... | ['datasets', 'models'] | {}
- validmind.model_validation.sklearn.PopulationStabilityIndex | Population Stability Index | Evaluates the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across... | ['model', 'datasets'] | {'num_bins': 10, 'mode': 'fixed'}
- validmind.model_validation.sklearn.MinimumAccuracy | Minimum Accuracy | Checks if the model's prediction accuracy meets or surpasses a specified threshold.... | ['model', 'dataset'] | {'min_threshold': 0.7}
+ validmind.model_validation.ClusterSizeDistribution | Cluster Size Distribution | Assesses the performance of clustering models by comparing the distribution of cluster sizes in model predictions... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['clustering']
+ validmind.model_validation.TimeSeriesR2SquareBySegments | Time Series R2 Square By Segments | Evaluates the R-Squared values of regression models over specified time segments in time series data to assess... | ['dataset', 'model'] | {'segments': {'type': '_empty', 'default': None}} | ['model_performance', 'sklearn'] | ['regression', 'time_series_forecasting']
+ validmind.model_validation.sklearn.AdjustedMutualInformation | Adjusted Mutual Information | Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering']
+ validmind.model_validation.sklearn.AdjustedRandIndex | Adjusted Rand Index | Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering']
+ validmind.model_validation.sklearn.CalibrationCurve | Calibration Curve | Evaluates the calibration of probability estimates by comparing predicted probabilities against observed... | ['model', 'dataset'] | {'n_bins': {'type': 'int', 'default': 10}} | ['sklearn', 'model_performance', 'classification'] | ['classification']
+ validmind.model_validation.sklearn.ClassifierPerformance | Classifier Performance | Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,... | ['dataset', 'model'] | {'average': {'type': 'str', 'default': 'macro'}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.ClassifierThresholdOptimization | Classifier Threshold Optimization | Analyzes and visualizes different threshold optimization methods for binary classification models.... | ['dataset', 'model'] | {'methods': {'type': None, 'default': None}, 'target_recall': {'type': None, 'default': None}} | ['model_validation', 'threshold_optimization', 'classification_metrics'] | ['classification']
+ validmind.model_validation.sklearn.ClusterCosineSimilarity | Cluster Cosine Similarity | Measures the intra-cluster similarity of a clustering model using cosine similarity.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering']
+ validmind.model_validation.sklearn.ClusterPerformanceMetrics | Cluster Performance Metrics | Evaluates the performance of clustering machine learning models using multiple established metrics.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering']
+ validmind.model_validation.sklearn.CompletenessScore | Completeness Score | Evaluates a clustering model's capacity to categorize instances from a single class into the same cluster.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance', 'clustering'] | ['clustering']
+ validmind.model_validation.sklearn.ConfusionMatrix | Confusion Matrix | Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix... | ['dataset', 'model'] | {'threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.FeatureImportance | Feature Importance | Compute feature importance scores for a given model and generate a summary table... | ['dataset', 'model'] | {'num_features': {'type': 'int', 'default': 3}} | ['model_explainability', 'sklearn'] | ['regression', 'time_series_forecasting']
+ validmind.model_validation.sklearn.FowlkesMallowsScore | Fowlkes Mallows Score | Evaluates the similarity between predicted and actual cluster assignments in a model using the Fowlkes-Mallows... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['clustering']
+ validmind.model_validation.sklearn.HomogeneityScore | Homogeneity Score | Assesses clustering homogeneity by comparing true and predicted labels, scoring from 0 (heterogeneous) to 1... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['clustering']
+ validmind.model_validation.sklearn.HyperParametersTuning | Hyper Parameters Tuning | Performs exhaustive grid search over specified parameter ranges to find optimal model configurations... | ['model', 'dataset'] | {'param_grid': {'type': 'dict', 'default': None}, 'scoring': {'type': None, 'default': None}, 'thresholds': {'type': None, 'default': None}, 'fit_params': {'type': 'dict', 'default': None}} | ['sklearn', 'model_performance'] | ['clustering', 'classification']
+ validmind.model_validation.sklearn.KMeansClustersOptimization | K Means Clusters Optimization | Optimizes the number of clusters in K-means models using Elbow and Silhouette methods.... | ['model', 'dataset'] | {'n_clusters': {'type': None, 'default': None}} | ['sklearn', 'model_performance', 'kmeans'] | ['clustering']
+ validmind.model_validation.sklearn.MinimumAccuracy | Minimum Accuracy | Checks if the model's prediction accuracy meets or surpasses a specified threshold.... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.7}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.MinimumF1Score | Minimum F1 Score | Assesses if the model's F1 score on the validation set meets a predefined minimum threshold, ensuring balanced... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.MinimumROCAUCScore | Minimum ROCAUC Score | Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.ModelParameters | Model Parameters | Extracts and displays model parameters in a structured format for transparency and reproducibility.... | ['model'] | {'model_params': {'type': '_empty', 'default': None}} | ['model_training', 'metadata'] | ['classification', 'regression']
+ validmind.model_validation.sklearn.ModelsPerformanceComparison | Models Performance Comparison | Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,... | ['dataset', 'models'] | {} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'model_comparison'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.OverfitDiagnosis | Overfit Diagnosis | Assesses potential overfitting in a model's predictions, identifying regions where performance between training and... | ['model', 'datasets'] | {'metric': {'type': 'str', 'default': None}, 'cut_off_threshold': {'type': 'float', 'default': 0.04}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'linear_regression', 'model_diagnosis'] | ['classification', 'regression']
+ validmind.model_validation.sklearn.PermutationFeatureImportance | Permutation Feature Importance | Assesses the significance of each feature in a model by evaluating the impact on model performance when feature... | ['model', 'dataset'] | {'fontsize': {'type': None, 'default': None}, 'figure_height': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'feature_importance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.PopulationStabilityIndex | Population Stability Index | Assesses the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across... | ['datasets', 'model'] | {'num_bins': {'type': 'int', 'default': 10}, 'mode': {'type': 'str', 'default': 'fixed'}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.PrecisionRecallCurve | Precision Recall Curve | Evaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve.... | ['model', 'dataset'] | {} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.ROCCurve | ROC Curve | Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic... | ['model', 'dataset'] | {} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.RegressionErrors | Regression Errors | Assesses the performance and error distribution of a regression model using various error metrics.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance'] | ['regression', 'classification']
+ validmind.model_validation.sklearn.RegressionErrorsComparison | Regression Errors Comparison | Assesses multiple regression error metrics to compare model performance across different datasets, emphasizing... | ['datasets', 'models'] | {} | ['model_performance', 'sklearn'] | ['regression', 'time_series_forecasting']
+ validmind.model_validation.sklearn.RegressionPerformance | Regression Performance | Evaluates the performance of a regression model using five different metrics: MAE, MSE, RMSE, MAPE, and MBD.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance'] | ['regression']
+ validmind.model_validation.sklearn.RegressionR2Square | Regression R2 Square | Assesses the overall goodness-of-fit of a regression model by evaluating R-squared (R2) and Adjusted R-squared (Adj... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['regression']
+ validmind.model_validation.sklearn.RegressionR2SquareComparison | Regression R2 Square Comparison | Compares R-Squared and Adjusted R-Squared values for different regression models across multiple datasets to assess... | ['datasets', 'models'] | {} | ['model_performance', 'sklearn'] | ['regression', 'time_series_forecasting']
+ validmind.model_validation.sklearn.RobustnessDiagnosis | Robustness Diagnosis | Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.... | ['datasets', 'model'] | {'metric': {'type': 'str', 'default': None}, 'scaling_factor_std_dev_list': {'type': None, 'default': [0.1, 0.2, 0.3, 0.4, 0.5]}, 'performance_decay_threshold': {'type': 'float', 'default': 0.05}} | ['sklearn', 'model_diagnosis', 'visualization'] | ['classification', 'regression']
+ validmind.model_validation.sklearn.SHAPGlobalImportance | SHAP Global Importance | Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.... | ['model', 'dataset'] | {'kernel_explainer_samples': {'type': 'int', 'default': 10}, 'tree_or_linear_explainer_samples': {'type': 'int', 'default': 200}, 'class_of_interest': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'feature_importance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.ScoreProbabilityAlignment | Score Probability Alignment | Analyzes the alignment between credit scores and predicted probabilities.... | ['model', 'dataset'] | {'score_column': {'type': 'str', 'default': 'score'}, 'n_bins': {'type': 'int', 'default': 10}} | ['visualization', 'credit_risk', 'calibration'] | ['classification']
+ validmind.model_validation.sklearn.SilhouettePlot | Silhouette Plot | Calculates and visualizes Silhouette Score, assessing the degree of data point suitability to its cluster in ML... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance'] | ['clustering']
+ validmind.model_validation.sklearn.TrainingTestDegradation | Training Test Degradation | Tests if model performance degradation between training and test datasets exceeds a predefined threshold.... | ['datasets', 'model'] | {'max_threshold': {'type': 'float', 'default': 0.1}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.VMeasure | V Measure | Evaluates homogeneity and completeness of a clustering model using the V Measure Score.... | ['dataset', 'model'] | {} | ['sklearn', 'model_performance'] | ['clustering']
+ validmind.model_validation.sklearn.WeakspotsDiagnosis | Weakspots Diagnosis | Identifies and visualizes weak spots in a machine learning model's performance across various sections of the... | ['datasets', 'model'] | {'features_columns': {'type': None, 'default': None}, 'metrics': {'type': None, 'default': None}, 'thresholds': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_diagnosis', 'visualization'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.CalibrationCurveDrift | Calibration Curve Drift | Evaluates changes in probability calibration between reference and monitoring datasets.... | ['datasets', 'model'] | {'n_bins': {'type': 'int', 'default': 10}, 'drift_pct_threshold': {'type': 'float', 'default': 20}} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.ClassDiscriminationDrift | Class Discrimination Drift | Compares classification discrimination metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.ClassificationAccuracyDrift | Classification Accuracy Drift | Compares classification accuracy metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.ConfusionMatrixDrift | Confusion Matrix Drift | Compares confusion matrix metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.ROCCurveDrift | ROC Curve Drift | Compares ROC curves between reference and monitoring datasets.... | ['datasets', 'model'] | {} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -1827,442 +2585,715 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
- ID | Name | Description | Required Inputs | Params
+ ID | Name | Description | Required Inputs | Params | Tags | Tasks
- validmind.model_validation.FeaturesAUC | Features AUC | Evaluates the discriminatory power of each individual feature within a binary classification model by calculating the Area Under the Curve (AUC) for each feature separately.... | ['model', 'dataset'] | {'fontsize': 12, 'figure_height': 500}
- validmind.model_validation.ModelMetadata | Model Metadata | Extracts and summarizes critical metadata from a machine learning model instance for comprehensive analysis.... | ['model'] | None
- validmind.model_validation.sklearn.RobustnessDiagnosis | Robustness Diagnosis | Evaluates the robustness of a machine learning model by injecting Gaussian noise to input data and measuring... | ['model', 'datasets'] | {'features_columns': None, 'scaling_factor_std_dev_list': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], 'accuracy_decay_threshold': 4}
- validmind.model_validation.sklearn.SHAPGlobalImportance | SHAP Global Importance | Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.... | ['model', 'dataset'] | {'kernel_explainer_samples': 10, 'tree_or_linear_explainer_samples': 200}
- validmind.model_validation.sklearn.ConfusionMatrix | Confusion Matrix | Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.OverfitDiagnosis | Overfit Diagnosis | Detects and visualizes overfit regions in an ML model by comparing performance on training and test datasets.... | ['model', 'datasets'] | {'features_columns': None, 'cut_off_percentage': 4}
- validmind.model_validation.sklearn.PermutationFeatureImportance | Permutation Feature Importance | Assesses the significance of each feature in a model by evaluating the impact on model performance when feature... | ['model', 'dataset'] | {'fontsize': None, 'figure_height': 1000}
- validmind.model_validation.sklearn.MinimumROCAUCScore | Minimum ROCAUC Score | Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.... | ['model', 'dataset'] | {'min_threshold': 0.5}
- validmind.model_validation.sklearn.PrecisionRecallCurve | Precision Recall Curve | Evaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve.... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.ClassifierPerformance | Classifier Performance | Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.MinimumF1Score | Minimum F1 Score | Evaluates if the model's F1 score on the validation set meets a predefined minimum threshold.... | ['model', 'dataset'] | {'min_threshold': 0.5}
- validmind.model_validation.sklearn.ROCCurve | ROC Curve | Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic... | ['model', 'dataset'] | None
- validmind.model_validation.sklearn.TrainingTestDegradation | Training Test Degradation | Tests if model performance degradation between training and test datasets exceeds a predefined threshold.... | ['model', 'datasets'] | {'metrics': ['accuracy', 'precision', 'recall', 'f1'], 'max_threshold': 0.1}
- validmind.model_validation.sklearn.HyperParametersTuning | Hyper Parameters Tuning | Exerts exhaustive grid search to identify optimal hyperparameters for the model, improving performance.... | ['model', 'dataset'] | {'param_grid': None, 'scoring': None}
- validmind.model_validation.sklearn.ModelsPerformanceComparison | Models Performance Comparison | Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,... | ['dataset', 'models'] | None
- validmind.model_validation.sklearn.WeakspotsDiagnosis | Weakspots Diagnosis | Identifies and visualizes weak spots in a machine learning model's performance across various sections of the... | ['model', 'datasets'] | {'features_columns': None, 'thresholds': {'accuracy': 0.75, 'precision': 0.5, 'recall': 0.5, 'f1': 0.7}}
- validmind.model_validation.sklearn.PopulationStabilityIndex | Population Stability Index | Evaluates the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across... | ['model', 'datasets'] | {'num_bins': 10, 'mode': 'fixed'}
- validmind.model_validation.sklearn.MinimumAccuracy | Minimum Accuracy | Checks if the model's prediction accuracy meets or surpasses a specified threshold.... | ['model', 'dataset'] | {'min_threshold': 0.7}
- validmind.model_validation.statsmodels.ScorecardHistogram | Scorecard Histogram | Creates histograms of credit scores, from both default and non-default instances, generated by a credit-risk model.... | ['datasets'] | {'title': 'Histogram of Scores', 'score_column': 'score'}
- validmind.model_validation.statsmodels.JarqueBera | Jarque Bera | Assesses normality of dataset features in an ML model using the Jarque-Bera test.... | ['dataset'] | None
- validmind.model_validation.statsmodels.KolmogorovSmirnov | Kolmogorov Smirnov | Executes a feature-wise Kolmogorov-Smirnov test to evaluate alignment with normal distribution in datasets.... | ['dataset'] | {'dist': 'norm'}
- validmind.model_validation.statsmodels.ShapiroWilk | Shapiro Wilk | Evaluates feature-wise normality of training data using the Shapiro-Wilk test.... | ['dataset'] | None
- validmind.model_validation.statsmodels.CumulativePredictionProbabilities | Cumulative Prediction Probabilities | Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic... | ['model', 'datasets'] | {'title': 'Cumulative Probabilities'}
- validmind.model_validation.statsmodels.Lilliefors | Lilliefors | Assesses the normality of feature distributions in an ML model's training dataset using the Lilliefors test.... | ['dataset'] | None
- validmind.model_validation.statsmodels.RunsTest | Runs Test | Executes Runs Test on ML model to detect non-random patterns in output data sequence.... | ['dataset'] | None
- validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram | Prediction Probabilities Histogram | Generates and visualizes histograms of the Probability of Default predictions for both positive and negative... | ['model', 'datasets'] | {'title': 'Histogram of Predictive Probabilities'}
- validmind.model_validation.statsmodels.GINITable | GINI Table | Evaluates classification model performance using AUC, GINI, and KS metrics for training and test datasets.... | ['model', 'datasets'] | None
- validmind.data_validation.MissingValuesRisk | Missing Values Risk | Assesses and quantifies the risk related to missing values in a dataset used for training an ML model.... | ['dataset'] | None
- validmind.data_validation.IQROutliersTable | IQR Outliers Table | Determines and summarizes outliers in numerical features using Interquartile Range method.... | ['dataset'] | {'features': None, 'threshold': 1.5}
- validmind.data_validation.BivariateFeaturesBarPlots | Bivariate Features Bar Plots | Generates visual bar plots to analyze the relationship between paired features within categorical data in the model.... | ['dataset'] | {'features_pairs': None}
- validmind.data_validation.Skewness | Skewness | Evaluates the skewness of numerical data in a machine learning model and checks if it falls below a set maximum... | ['dataset'] | {'max_threshold': 1}
- validmind.data_validation.Duplicates | Duplicates | Tests dataset for duplicate entries, ensuring model reliability via data quality verification.... | ['dataset'] | {'min_threshold': 1}
- validmind.data_validation.MissingValuesBarPlot | Missing Values Bar Plot | Creates a bar plot showcasing the percentage of missing values in each column of the dataset with risk... | ['dataset'] | {'threshold': 80, 'fig_height': 600}
- validmind.data_validation.DatasetDescription | Dataset Description | Provides comprehensive analysis and statistical summaries of each field in a machine learning model's dataset.... | ['dataset'] | None
- validmind.data_validation.ScatterPlot | Scatter Plot | Creates a scatter plot matrix to visually analyze feature relationships, patterns, and outliers in a dataset.... | ['dataset'] | None
- validmind.data_validation.TabularCategoricalBarPlots | Tabular Categorical Bar Plots | Generates and visualizes bar plots for each category in categorical features to evaluate dataset's composition.... | ['dataset'] | None
- validmind.data_validation.DescriptiveStatistics | Descriptive Statistics | Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's... | ['dataset'] | None
- validmind.data_validation.ANOVAOneWayTable | ANOVA One Way Table | Applies one-way ANOVA (Analysis of Variance) to identify statistically significant numerical features in the... | ['dataset'] | {'features': None, 'p_threshold': 0.05}
- validmind.data_validation.TargetRateBarPlots | Target Rate Bar Plots | Generates bar plots visualizing the default rates of categorical features for a classification machine learning... | ['dataset'] | {'default_column': None, 'columns': None}
- validmind.data_validation.PearsonCorrelationMatrix | Pearson Correlation Matrix | Evaluates linear dependency between numerical variables in a dataset via a Pearson Correlation coefficient heat map.... | ['dataset'] | None
- validmind.data_validation.FeatureTargetCorrelationPlot | Feature Target Correlation Plot | Visualizes the correlation between input features and model's target output in a color-coded horizontal bar plot.... | ['dataset'] | {'features': None, 'fig_height': 600}
- validmind.data_validation.TabularNumericalHistograms | Tabular Numerical Histograms | Generates histograms for each numerical feature in a dataset to provide visual insights into data distribution and... | ['dataset'] | None
validmind.data_validation.IsolationForestOutliersIsolation Forest OutliersDetects outliers in a dataset using the Isolation Forest algorithm and visualizes results through scatter plots....['dataset']{'random_state': 0, 'contamination': 0.1, 'features_columns': None}
validmind.data_validation.ChiSquaredFeaturesTableChi Squared Features TableExecutes Chi-Squared test for each categorical feature against a target column to assess significant association....['dataset']{'cat_features': None, 'p_threshold': 0.05}
validmind.data_validation.HighCardinalityHigh CardinalityAssesses the number of unique values in categorical columns to detect high cardinality and potential overfitting....['dataset']{'num_threshold': 100, 'percent_threshold': 0.1, 'threshold_type': 'percent'}
validmind.data_validation.MissingValuesMissing ValuesEvaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold....['dataset']{'min_threshold': 1}
validmind.data_validation.TabularDescriptionTablesTabular Description TablesSummarizes key descriptive statistics for numerical, categorical, and datetime variables in a dataset....['dataset']None
validmind.data_validation.UniqueRowsUnique RowsVerifies the diversity of the dataset by ensuring that the count of unique rows exceeds a prescribed threshold....['dataset']{'min_percent_threshold': 1}
validmind.data_validation.TooManyZeroValuesToo Many Zero ValuesIdentifies numerical columns in a dataset that contain an excessive number of zero values, defined by a threshold...['dataset']{'max_percent_threshold': 0.03}
validmind.data_validation.HighPearsonCorrelationHigh Pearson CorrelationIdentifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity....['dataset']{'max_threshold': 0.3}
validmind.data_validation.BivariateHistogramsBivariate HistogramsGenerates bivariate histograms for paired features, aiding in visual inspection of categorical variables'...['dataset']{'features_pairs': None, 'target_filter': None}
validmind.data_validation.WOEBinTableWOE Bin TableCalculates and assesses the Weight of Evidence (WoE) and Information Value (IV) of each feature in a ML model....['dataset']{'breaks_adj': None}
validmind.data_validation.HeatmapFeatureCorrelationsHeatmap Feature CorrelationsCreates a heatmap to visually represent correlation patterns between pairs of numerical features in a dataset....['dataset']{'declutter': None, 'fontsize': None, 'num_features': None}
validmind.data_validation.DatasetSplitDataset SplitEvaluates and visualizes the distribution proportions among training, testing, and validation datasets of an ML...['datasets']None
validmind.data_validation.BivariateScatterPlotsBivariate Scatter PlotsGenerates bivariate scatterplots to visually inspect relationships between pairs of predictor variables in machine...['dataset']{'selected_columns': None}
validmind.data_validation.WOEBinPlotsWOE Bin PlotsGenerates visualizations of Weight of Evidence (WoE) and Information Value (IV) for understanding predictive power...['dataset']{'breaks_adj': None, 'fig_height': 600, 'fig_width': 500}
validmind.data_validation.ClassImbalanceClass ImbalanceEvaluates and quantifies class distribution imbalance in a dataset used by a machine learning model....['dataset']{'min_percent_threshold': 10}
validmind.data_validation.IQROutliersBarPlotIQR Outliers Bar PlotVisualizes outlier distribution across percentiles in numerical data using Interquartile Range (IQR) method....['dataset']{'threshold': 1.5, 'num_features': None, 'fig_width': 800}
validmind.data_validation.TabularDateTimeHistogramsTabular Date Time HistogramsGenerates histograms to provide graphical insight into the distribution of time intervals in model's datetime data....['dataset']Nonevalidmind.data_validation.BivariateScatterPlotsBivariate Scatter PlotsGenerates bivariate scatterplots to visually inspect relationships between pairs of numerical predictor variables...['dataset']{}['tabular_data', 'numerical_data', 'visualization']['classification']
+ validmind.data_validation.ChiSquaredFeaturesTable | Chi Squared Features Table | Assesses the statistical association between categorical features and a target variable using the Chi-Squared test.... | ['dataset'] | {'p_threshold': {'type': '_empty', 'default': 0.05}} | ['tabular_data', 'categorical_data', 'statistical_test'] | ['classification']
+ validmind.data_validation.ClassImbalance | Class Imbalance | Evaluates and quantifies class distribution imbalance in a dataset used by a machine learning model.... | ['dataset'] | {'min_percent_threshold': {'type': 'int', 'default': 10}} | ['tabular_data', 'binary_classification', 'multiclass_classification', 'data_quality'] | ['classification']
+ validmind.data_validation.DatasetDescription | Dataset Description | Provides comprehensive analysis and statistical summaries of each column in a machine learning model's dataset.... | ['dataset'] | {} | ['tabular_data', 'time_series_data', 'text_data'] | ['classification', 'regression', 'text_classification', 'text_summarization']
+ validmind.data_validation.DatasetSplit | Dataset Split | Evaluates and visualizes the distribution proportions among training, testing, and validation datasets of an ML... | ['datasets'] | {} | ['tabular_data', 'time_series_data', 'text_data'] | ['classification', 'regression', 'text_classification', 'text_summarization']
+ validmind.data_validation.DescriptiveStatistics | Descriptive Statistics | Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's... | ['dataset'] | {} | ['tabular_data', 'time_series_data', 'data_quality'] | ['classification', 'regression']
+ validmind.data_validation.Duplicates | Duplicates | Tests dataset for duplicate entries, ensuring model reliability via data quality verification.... | ['dataset'] | {'min_threshold': {'type': '_empty', 'default': 1}} | ['tabular_data', 'data_quality', 'text_data'] | ['classification', 'regression']
+ validmind.data_validation.FeatureTargetCorrelationPlot | Feature Target Correlation Plot | Visualizes the correlation between input features and the model's target output in a color-coded horizontal bar... | ['dataset'] | {'fig_height': {'type': '_empty', 'default': 600}} | ['tabular_data', 'visualization', 'correlation'] | ['classification', 'regression']
+ validmind.data_validation.HighCardinality | High Cardinality | Assesses the number of unique values in categorical columns to detect high cardinality and potential overfitting.... | ['dataset'] | {'num_threshold': {'type': 'int', 'default': 100}, 'percent_threshold': {'type': 'float', 'default': 0.1}, 'threshold_type': {'type': 'str', 'default': 'percent'}} | ['tabular_data', 'data_quality', 'categorical_data'] | ['classification', 'regression']
+ validmind.data_validation.HighPearsonCorrelation | High Pearson Correlation | Identifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity.... | ['dataset'] | {'max_threshold': {'type': 'float', 'default': 0.3}, 'top_n_correlations': {'type': 'int', 'default': 10}, 'feature_columns': {'type': 'list', 'default': None}} | ['tabular_data', 'data_quality', 'correlation'] | ['classification', 'regression']
+ validmind.data_validation.IQROutliersBarPlot | IQR Outliers Bar Plot | Visualizes outlier distribution across percentiles in numerical data using the Interquartile Range (IQR) method.... | ['dataset'] | {'threshold': {'type': 'float', 'default': 1.5}, 'fig_width': {'type': 'int', 'default': 800}} | ['tabular_data', 'visualization', 'numerical_data'] | ['classification', 'regression']
+ validmind.data_validation.IQROutliersTable | IQR Outliers Table | Determines and summarizes outliers in numerical features using the Interquartile Range method.... | ['dataset'] | {'threshold': {'type': 'float', 'default': 1.5}} | ['tabular_data', 'numerical_data'] | ['classification', 'regression']
+ validmind.data_validation.IsolationForestOutliers | Isolation Forest Outliers | Detects outliers in a dataset using the Isolation Forest algorithm and visualizes results through scatter plots.... | ['dataset'] | {'random_state': {'type': 'int', 'default': 0}, 'contamination': {'type': 'float', 'default': 0.1}, 'feature_columns': {'type': 'list', 'default': None}} | ['tabular_data', 'anomaly_detection'] | ['classification']
+ validmind.data_validation.JarqueBera | Jarque Bera | Assesses normality of dataset features in an ML model using the Jarque-Bera test.... | ['dataset'] | {} | ['tabular_data', 'data_distribution', 'statistical_test', 'statsmodels'] | ['classification', 'regression']
+ validmind.data_validation.MissingValues | Missing Values | Evaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold.... | ['dataset'] | {'min_threshold': {'type': 'int', 'default': 1}} | ['tabular_data', 'data_quality'] | ['classification', 'regression']
+ validmind.data_validation.MissingValuesBarPlot | Missing Values Bar Plot | Assesses the percentage and distribution of missing values in the dataset via a bar plot, with emphasis on... | ['dataset'] | {'threshold': {'type': 'int', 'default': 80}, 'fig_height': {'type': 'int', 'default': 600}} | ['tabular_data', 'data_quality', 'visualization'] | ['classification', 'regression']
+ validmind.data_validation.MutualInformation | Mutual Information | Calculates mutual information scores between features and target variable to evaluate feature relevance.... | ['dataset'] | {'min_threshold': {'type': 'float', 'default': 0.01}, 'task': {'type': 'str', 'default': 'classification'}} | ['feature_selection', 'data_analysis'] | ['classification', 'regression']
+ validmind.data_validation.PearsonCorrelationMatrix | Pearson Correlation Matrix | Evaluates linear dependency between numerical variables in a dataset via a Pearson Correlation coefficient heat map.... | ['dataset'] | {} | ['tabular_data', 'numerical_data', 'correlation'] | ['classification', 'regression']
+ validmind.data_validation.ProtectedClassesDescription | Protected Classes Description | Visualizes the distribution of protected classes in the dataset relative to the target variable... | ['dataset'] | {'protected_classes': {'type': '_empty', 'default': None}} | ['bias_and_fairness', 'descriptive_statistics'] | ['classification', 'regression']
+ validmind.data_validation.RunsTest | Runs Test | Executes Runs Test on ML model to detect non-random patterns in output data sequence.... | ['dataset'] | {} | ['tabular_data', 'statistical_test', 'statsmodels'] | ['classification', 'regression']
+ validmind.data_validation.ScatterPlot | Scatter Plot | Assesses visual relationships, patterns, and outliers among features in a dataset through scatter plot matrices.... | ['dataset'] | {} | ['tabular_data', 'visualization'] | ['classification', 'regression']
+ validmind.data_validation.ScoreBandDefaultRates | Score Band Default Rates | Analyzes default rates and population distribution across credit score bands.... | ['dataset', 'model'] | {'score_column': {'type': 'str', 'default': 'score'}, 'score_bands': {'type': 'list', 'default': None}} | ['visualization', 'credit_risk', 'scorecard'] | ['classification']
+ validmind.data_validation.ShapiroWilk | Shapiro Wilk | Evaluates feature-wise normality of training data using the Shapiro-Wilk test.... | ['dataset'] | {} | ['tabular_data', 'data_distribution', 'statistical_test'] | ['classification', 'regression']
+ validmind.data_validation.Skewness | Skewness | Evaluates the skewness of numerical data in a dataset to check against a defined threshold, aiming to ensure data... | ['dataset'] | {'max_threshold': {'type': '_empty', 'default': 1}} | ['data_quality', 'tabular_data'] | ['classification', 'regression']
+ validmind.data_validation.TabularCategoricalBarPlots | Tabular Categorical Bar Plots | Generates and visualizes bar plots for each category in categorical features to evaluate the dataset's composition.... | ['dataset'] | {} | ['tabular_data', 'visualization'] | ['classification', 'regression']
+ validmind.data_validation.TabularDateTimeHistograms | Tabular Date Time Histograms | Generates histograms to provide graphical insight into the distribution of time intervals in a model's datetime... | ['dataset'] | {} | ['time_series_data', 'visualization'] | ['classification', 'regression']
+ validmind.data_validation.TabularDescriptionTables | Tabular Description Tables | Summarizes key descriptive statistics for numerical, categorical, and datetime variables in a dataset.... | ['dataset'] | {} | ['tabular_data'] | ['classification', 'regression']
+ validmind.data_validation.TabularNumericalHistograms | Tabular Numerical Histograms | Generates histograms for each numerical feature in a dataset to provide visual insights into data distribution and... | ['dataset'] | {} | ['tabular_data', 'visualization'] | ['classification', 'regression']
+ validmind.data_validation.TargetRateBarPlots | Target Rate Bar Plots | Generates bar plots visualizing the default rates of categorical features for a classification machine learning... | ['dataset'] | {} | ['tabular_data', 'visualization', 'categorical_data'] | ['classification']
+ validmind.data_validation.TooManyZeroValues | Too Many Zero Values | Identifies numerical columns in a dataset that contain an excessive number of zero values, defined by a threshold... | ['dataset'] | {'max_percent_threshold': {'type': 'float', 'default': 0.03}} | ['tabular_data'] | ['regression', 'classification']
+ validmind.data_validation.UniqueRows | Unique Rows | Verifies the diversity of the dataset by ensuring that the count of unique rows exceeds a prescribed threshold.... | ['dataset'] | {'min_percent_threshold': {'type': 'float', 'default': 1}} | ['tabular_data'] | ['regression', 'classification']
+ validmind.data_validation.WOEBinPlots | WOE Bin Plots | Generates visualizations of Weight of Evidence (WoE) and Information Value (IV) for understanding predictive power... | ['dataset'] | {'breaks_adj': {'type': 'list', 'default': None}, 'fig_height': {'type': 'int', 'default': 600}, 'fig_width': {'type': 'int', 'default': 500}} | ['tabular_data', 'visualization', 'categorical_data'] | ['classification']
+ validmind.data_validation.WOEBinTable | WOE Bin Table | Assesses the Weight of Evidence (WoE) and Information Value (IV) of each feature to evaluate its predictive power... | ['dataset'] | {'breaks_adj': {'type': 'list', 'default': None}} | ['tabular_data', 'categorical_data'] | ['classification']
+ validmind.model_validation.FeaturesAUC | Features AUC | Evaluates the discriminatory power of each individual feature within a binary classification model by calculating... | ['dataset'] | {'fontsize': {'type': 'int', 'default': 12}, 'figure_height': {'type': 'int', 'default': 500}} | ['feature_importance', 'AUC', 'visualization'] | ['classification']
+ validmind.model_validation.sklearn.CalibrationCurve | Calibration Curve | Evaluates the calibration of probability estimates by comparing predicted probabilities against observed... | ['model', 'dataset'] | {'n_bins': {'type': 'int', 'default': 10}} | ['sklearn', 'model_performance', 'classification'] | ['classification']
+ validmind.model_validation.sklearn.ClassifierPerformance | Classifier Performance | Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,... | ['dataset', 'model'] | {'average': {'type': 'str', 'default': 'macro'}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.ClassifierThresholdOptimization | Classifier Threshold Optimization | Analyzes and visualizes different threshold optimization methods for binary classification models.... | ['dataset', 'model'] | {'methods': {'type': None, 'default': None}, 'target_recall': {'type': None, 'default': None}} | ['model_validation', 'threshold_optimization', 'classification_metrics'] | ['classification']
+ validmind.model_validation.sklearn.ConfusionMatrix | Confusion Matrix | Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix... | ['dataset', 'model'] | {'threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.HyperParametersTuning | Hyper Parameters Tuning | Performs exhaustive grid search over specified parameter ranges to find optimal model configurations... | ['model', 'dataset'] | {'param_grid': {'type': 'dict', 'default': None}, 'scoring': {'type': None, 'default': None}, 'thresholds': {'type': None, 'default': None}, 'fit_params': {'type': 'dict', 'default': None}} | ['sklearn', 'model_performance'] | ['clustering', 'classification']
+ validmind.model_validation.sklearn.MinimumAccuracy | Minimum Accuracy | Checks if the model's prediction accuracy meets or surpasses a specified threshold.... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.7}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.MinimumF1Score | Minimum F1 Score | Assesses if the model's F1 score on the validation set meets a predefined minimum threshold, ensuring balanced... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.MinimumROCAUCScore | Minimum ROCAUC Score | Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.... | ['dataset', 'model'] | {'min_threshold': {'type': 'float', 'default': 0.5}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.ModelParameters | Model Parameters | Extracts and displays model parameters in a structured format for transparency and reproducibility.... | ['model'] | {'model_params': {'type': '_empty', 'default': None}} | ['model_training', 'metadata'] | ['classification', 'regression']
+ validmind.model_validation.sklearn.ModelsPerformanceComparison | Models Performance Comparison | Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,... | ['dataset', 'models'] | {} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'model_comparison'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.OverfitDiagnosis | Overfit Diagnosis | Assesses potential overfitting in a model's predictions, identifying regions where performance between training and... | ['model', 'datasets'] | {'metric': {'type': 'str', 'default': None}, 'cut_off_threshold': {'type': 'float', 'default': 0.04}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'linear_regression', 'model_diagnosis'] | ['classification', 'regression']
+ validmind.model_validation.sklearn.PermutationFeatureImportance | Permutation Feature Importance | Assesses the significance of each feature in a model by evaluating the impact on model performance when feature... | ['model', 'dataset'] | {'fontsize': {'type': None, 'default': None}, 'figure_height': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'feature_importance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.PopulationStabilityIndex | Population Stability Index | Assesses the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across... | ['datasets', 'model'] | {'num_bins': {'type': 'int', 'default': 10}, 'mode': {'type': 'str', 'default': 'fixed'}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.PrecisionRecallCurve | Precision Recall Curve | Evaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve.... | ['model', 'dataset'] | {} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.ROCCurve | ROC Curve | Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic... | ['model', 'dataset'] | {} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.RegressionErrors | Regression Errors | Assesses the performance and error distribution of a regression model using various error metrics.... | ['model', 'dataset'] | {} | ['sklearn', 'model_performance'] | ['regression', 'classification']
+ validmind.model_validation.sklearn.RobustnessDiagnosis | Robustness Diagnosis | Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.... | ['datasets', 'model'] | {'metric': {'type': 'str', 'default': None}, 'scaling_factor_std_dev_list': {'type': None, 'default': [0.1, 0.2, 0.3, 0.4, 0.5]}, 'performance_decay_threshold': {'type': 'float', 'default': 0.05}} | ['sklearn', 'model_diagnosis', 'visualization'] | ['classification', 'regression']
+ validmind.model_validation.sklearn.SHAPGlobalImportance | SHAP Global Importance | Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.... | ['model', 'dataset'] | {'kernel_explainer_samples': {'type': 'int', 'default': 10}, 'tree_or_linear_explainer_samples': {'type': 'int', 'default': 200}, 'class_of_interest': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'feature_importance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.ScoreProbabilityAlignment | Score Probability Alignment | Analyzes the alignment between credit scores and predicted probabilities.... | ['model', 'dataset'] | {'score_column': {'type': 'str', 'default': 'score'}, 'n_bins': {'type': 'int', 'default': 10}} | ['visualization', 'credit_risk', 'calibration'] | ['classification']
+ validmind.model_validation.sklearn.TrainingTestDegradation | Training Test Degradation | Tests if model performance degradation between training and test datasets exceeds a predefined threshold.... | ['datasets', 'model'] | {'max_threshold': {'type': 'float', 'default': 0.1}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.sklearn.WeakspotsDiagnosis | Weakspots Diagnosis | Identifies and visualizes weak spots in a machine learning model's performance across various sections of the... | ['datasets', 'model'] | {'features_columns': {'type': None, 'default': None}, 'metrics': {'type': None, 'default': None}, 'thresholds': {'type': None, 'default': None}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_diagnosis', 'visualization'] | ['classification', 'text_classification']
+ validmind.model_validation.statsmodels.CumulativePredictionProbabilities | Cumulative Prediction Probabilities | Visualizes cumulative probabilities of positive and negative classes for both training and testing in classification models.... | ['dataset', 'model'] | {'title': {'type': '_empty', 'default': 'Cumulative Probabilities'}} | ['visualization', 'credit_risk'] | ['classification']
+ validmind.model_validation.statsmodels.GINITable | GINI Table | Evaluates classification model performance using AUC, GINI, and KS metrics for training and test datasets.... | ['dataset', 'model'] | {} | ['model_performance'] | ['classification']
+ validmind.model_validation.statsmodels.KolmogorovSmirnov | Kolmogorov Smirnov | Assesses whether each feature in the dataset aligns with a normal distribution using the Kolmogorov-Smirnov test.... | ['model', 'dataset'] | {'dist': {'type': 'str', 'default': 'norm'}} | ['tabular_data', 'data_distribution', 'statistical_test', 'statsmodels'] | ['classification', 'regression']
+ validmind.model_validation.statsmodels.Lilliefors | Lilliefors | Assesses the normality of feature distributions in an ML model's training dataset using the Lilliefors test.... | ['dataset'] | {} | ['tabular_data', 'data_distribution', 'statistical_test', 'statsmodels'] | ['classification', 'regression']
+ validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram | Prediction Probabilities Histogram | Assesses the predictive probability distribution for binary classification to evaluate model performance and... | ['dataset', 'model'] | {'title': {'type': '_empty', 'default': 'Histogram of Predictive Probabilities'}} | ['visualization', 'credit_risk'] | ['classification']
+ validmind.model_validation.statsmodels.ScorecardHistogram | Scorecard Histogram | The Scorecard Histogram test evaluates the distribution of credit scores between default and non-default instances,... | ['dataset'] | {'title': {'type': '_empty', 'default': 'Histogram of Scores'}, 'score_column': {'type': '_empty', 'default': 'score'}} | ['visualization', 'credit_risk', 'logistic_regression'] | ['classification']
+ validmind.ongoing_monitoring.CalibrationCurveDrift | Calibration Curve Drift | Evaluates changes in probability calibration between reference and monitoring datasets.... | ['datasets', 'model'] | {'n_bins': {'type': 'int', 'default': 10}, 'drift_pct_threshold': {'type': 'float', 'default': 20}} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.ClassDiscriminationDrift | Class Discrimination Drift | Compares classification discrimination metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.ClassImbalanceDrift | Class Imbalance Drift | Evaluates drift in class distribution between reference and monitoring datasets.... | ['datasets'] | {'drift_pct_threshold': {'type': 'float', 'default': 5.0}, 'title': {'type': 'str', 'default': 'Class Distribution Drift'}} | ['tabular_data', 'binary_classification', 'multiclass_classification'] | ['classification']
+ validmind.ongoing_monitoring.ClassificationAccuracyDrift | Classification Accuracy Drift | Compares classification accuracy metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.ConfusionMatrixDrift | Confusion Matrix Drift | Compares confusion matrix metrics between reference and monitoring datasets.... | ['datasets', 'model'] | {'drift_pct_threshold': {'type': '_empty', 'default': 20}} | ['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.CumulativePredictionProbabilitiesDrift | Cumulative Prediction Probabilities Drift | Compares cumulative prediction probability distributions between reference and monitoring datasets.... | ['datasets', 'model'] | {} | ['visualization', 'credit_risk'] | ['classification']
+ validmind.ongoing_monitoring.PredictionProbabilitiesHistogramDrift | Prediction Probabilities Histogram Drift | Compares prediction probability distributions between reference and monitoring datasets.... | ['datasets', 'model'] | {'title': {'type': '_empty', 'default': 'Prediction Probabilities Histogram Drift'}, 'drift_pct_threshold': {'type': 'float', 'default': 20.0}} | ['visualization', 'credit_risk'] | ['classification']
+ validmind.ongoing_monitoring.ROCCurveDrift | ROC Curve Drift | Compares ROC curves between reference and monitoring datasets.... | ['datasets', 'model'] | {} | ['sklearn', 'binary_classification', 'model_performance', 'visualization'] | ['classification', 'text_classification']
+ validmind.ongoing_monitoring.ScoreBandsDrift | Score Bands Drift | Analyzes drift in population distribution and default rates across score bands.... | ['datasets', 'model'] | {'score_column': {'type': 'str', 'default': 'score'}, 'score_bands': {'type': 'list', 'default': None}, 'drift_threshold': {'type': 'float', 'default': 20.0}} | ['visualization', 'credit_risk', 'scorecard'] | ['classification']
+ validmind.ongoing_monitoring.ScorecardHistogramDrift | Scorecard Histogram Drift | Compares score distributions between reference and monitoring datasets for each class.... | ['datasets'] | {'score_column': {'type': 'str', 'default': 'score'}, 'title': {'type': 'str', 'default': 'Scorecard Histogram Drift'}, 'drift_pct_threshold': {'type': 'float', 'default': 20.0}} | ['visualization', 'credit_risk', 'logistic_regression'] | ['classification']
+ validmind.unit_metrics.classification.Accuracy | Accuracy | Calculates the accuracy of a model | ['dataset', 'model'] | {} | ['classification'] | ['classification']
+ validmind.unit_metrics.classification.F1 | F1 | Calculates the F1 score for a classification model. | ['model', 'dataset'] | {} | ['classification'] | ['classification']
+ validmind.unit_metrics.classification.Precision | Precision | Calculates the precision for a classification model. | ['model', 'dataset'] | {} | ['classification'] | ['classification']
+ validmind.unit_metrics.classification.ROC_AUC | ROC AUC | Calculates the ROC AUC for a classification model. | ['model', 'dataset'] | {} | ['classification'] | ['classification']
+ validmind.unit_metrics.classification.Recall | Recall | Calculates the recall for a classification model. | ['model', 'dataset'] | {} | ['classification'] | ['classification']
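The rows reconstructed above are the rendered output of the library's test catalog. As a hedged sketch of the kind of call that typically produces such a listing (the `filter` and `task` keyword arguments are assumptions based on the Tags and Tasks columns shown above, not confirmed by this diff):

```python
import validmind as vm

# List every registered test; the rendered table carries the same columns
# reconstructed above (ID, Name, Description, Required Inputs, Params, ...).
vm.tests.list_tests()

# Narrow the listing, e.g. to sklearn-tagged classification tests
# ("filter" and "task" are assumed keyword arguments).
vm.tests.list_tests(filter="sklearn", task="classification")
```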
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 7, @@ -2290,64 +3321,94 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
IDNameDescriptionRequired InputsParamsIDNameDescriptionRequired InputsParamsTagsTasks
validmind.model_validation.sklearn.ConfusionMatrixConfusion MatrixEvaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix...['model', 'dataset']None
validmind.model_validation.sklearn.PrecisionRecallCurvePrecision Recall CurveEvaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve....['model', 'dataset']None
validmind.model_validation.sklearn.ROCCurveROC CurveEvaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic...['model', 'dataset']None
validmind.model_validation.sklearn.TrainingTestDegradationTraining Test DegradationTests if model performance degradation between training and test datasets exceeds a predefined threshold....['model', 'datasets']{'metrics': ['accuracy', 'precision', 'recall', 'f1'], 'max_threshold': 0.1}
validmind.model_validation.statsmodels.GINITableGINI TableEvaluates classification model performance using AUC, GINI, and KS metrics for training and test datasets....['model', 'datasets']Nonevalidmind.model_validation.RegressionResidualsPlotRegression Residuals PlotEvaluates regression model performance using residual distribution and actual vs. predicted plots....['model', 'dataset']{'bin_size': {'type': 'float', 'default': 0.1}}['model_performance', 'visualization']['regression']
validmind.model_validation.sklearn.ConfusionMatrixConfusion MatrixEvaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix...['dataset', 'model']{'threshold': {'type': 'float', 'default': 0.5}}['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.model_validation.sklearn.PrecisionRecallCurvePrecision Recall CurveEvaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve....['model', 'dataset']{}['sklearn', 'binary_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.model_validation.sklearn.ROCCurveROC CurveEvaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic...['model', 'dataset']{}['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.model_validation.sklearn.TrainingTestDegradationTraining Test DegradationTests if model performance degradation between training and test datasets exceeds a predefined threshold....['datasets', 'model']{'max_threshold': {'type': 'float', 'default': 0.1}}['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.ongoing_monitoring.CalibrationCurveDriftCalibration Curve DriftEvaluates changes in probability calibration between reference and monitoring datasets....['datasets', 'model']{'n_bins': {'type': 'int', 'default': 10}, 'drift_pct_threshold': {'type': 'float', 'default': 20}}['sklearn', 'binary_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.ongoing_monitoring.ROCCurveDriftROC Curve DriftCompares ROC curves between reference and monitoring datasets....['datasets', 'model']{}['sklearn', 'binary_classification', 'model_performance', 'visualization']['classification', 'text_classification']
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -2375,57 +3436,85 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
IDNameDescriptionRequired InputsParamsIDNameDescriptionRequired InputsParamsTagsTasks
validmind.model_validation.sklearn.ConfusionMatrixConfusion MatrixEvaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix...['model', 'dataset']None
validmind.model_validation.sklearn.PrecisionRecallCurvePrecision Recall CurveEvaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve....['model', 'dataset']None
validmind.model_validation.sklearn.ROCCurveROC CurveEvaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic...['model', 'dataset']None
validmind.model_validation.sklearn.TrainingTestDegradationTraining Test DegradationTests if model performance degradation between training and test datasets exceeds a predefined threshold....['model', 'datasets']{'metrics': ['accuracy', 'precision', 'recall', 'f1'], 'max_threshold': 0.1}validmind.model_validation.sklearn.ConfusionMatrixConfusion MatrixEvaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix...['dataset', 'model']{'threshold': {'type': 'float', 'default': 0.5}}['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.model_validation.sklearn.PrecisionRecallCurvePrecision Recall CurveEvaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve....['model', 'dataset']{}['sklearn', 'binary_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.model_validation.sklearn.ROCCurveROC CurveEvaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic...['model', 'dataset']{}['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.model_validation.sklearn.TrainingTestDegradationTraining Test DegradationTests if model performance degradation between training and test datasets exceeds a predefined threshold....['datasets', 'model']{'max_threshold': {'type': 'float', 'default': 0.1}}['sklearn', 'binary_classification', 'multiclass_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.ongoing_monitoring.CalibrationCurveDriftCalibration Curve DriftEvaluates changes in probability calibration between reference and monitoring datasets....['datasets', 'model']{'n_bins': {'type': 'int', 'default': 10}, 'drift_pct_threshold': {'type': 'float', 'default': 20}}['sklearn', 'binary_classification', 'model_performance', 'visualization']['classification', 'text_classification']
validmind.ongoing_monitoring.ROCCurveDriftROC Curve DriftCompares ROC curves between reference and monitoring datasets....['datasets', 'model']{}['sklearn', 'binary_classification', 'model_performance', 'visualization']['classification', 'text_classification']
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 9, @@ -2456,46 +3545,46 @@ { "data": { "text/plain": [ - "['validmind.prompt_validation.Bias',\n", - " 'validmind.prompt_validation.Clarity',\n", - " 'validmind.prompt_validation.Specificity',\n", - " 'validmind.prompt_validation.Robustness',\n", - " 'validmind.prompt_validation.NegativeInstruction',\n", - " 'validmind.prompt_validation.Conciseness',\n", - " 'validmind.prompt_validation.Delimitation',\n", + "['validmind.data_validation.DatasetDescription',\n", + " 'validmind.data_validation.DatasetSplit',\n", + " 'validmind.data_validation.nlp.CommonWords',\n", + " 'validmind.data_validation.nlp.Hashtags',\n", + " 'validmind.data_validation.nlp.LanguageDetection',\n", + " 'validmind.data_validation.nlp.Mentions',\n", + " 'validmind.data_validation.nlp.Punctuations',\n", + " 'validmind.data_validation.nlp.StopWords',\n", + " 'validmind.data_validation.nlp.TextDescription',\n", " 'validmind.model_validation.BertScore',\n", - " 'validmind.model_validation.RegardScore',\n", " 'validmind.model_validation.BleuScore',\n", " 'validmind.model_validation.ContextualRecall',\n", " 'validmind.model_validation.MeteorScore',\n", + " 'validmind.model_validation.RegardScore',\n", " 'validmind.model_validation.RougeScore',\n", - " 'validmind.model_validation.ModelMetadata',\n", " 'validmind.model_validation.TokenDisparity',\n", " 'validmind.model_validation.ToxicityScore',\n", " 'validmind.model_validation.embeddings.CosineSimilarityComparison',\n", - " 'validmind.model_validation.embeddings.TSNEComponentsPairwisePlots',\n", - " 'validmind.model_validation.embeddings.PCAComponentsPairwisePlots',\n", " 'validmind.model_validation.embeddings.CosineSimilarityHeatmap',\n", " 'validmind.model_validation.embeddings.EuclideanDistanceComparison',\n", " 'validmind.model_validation.embeddings.EuclideanDistanceHeatmap',\n", - " 'validmind.model_validation.ragas.ContextEntityRecall',\n", - " 'validmind.model_validation.ragas.Faithfulness',\n", - " 'validmind.model_validation.ragas.AspectCritique',\n", - " 'validmind.model_validation.ragas.AnswerSimilarity',\n", + " 'validmind.model_validation.embeddings.PCAComponentsPairwisePlots',\n", + " 'validmind.model_validation.embeddings.TSNEComponentsPairwisePlots',\n", " 'validmind.model_validation.ragas.AnswerCorrectness',\n", - " 'validmind.model_validation.ragas.ContextRecall',\n", - " 'validmind.model_validation.ragas.ContextRelevancy',\n", + " 'validmind.model_validation.ragas.AspectCritic',\n", + " 'validmind.model_validation.ragas.ContextEntityRecall',\n", " 'validmind.model_validation.ragas.ContextPrecision',\n", - " 'validmind.model_validation.ragas.AnswerRelevance',\n", - " 'validmind.data_validation.DatasetDescription',\n", - " 'validmind.data_validation.DatasetSplit',\n", - " 'validmind.data_validation.nlp.Punctuations',\n", - " 'validmind.data_validation.nlp.CommonWords',\n", - " 'validmind.data_validation.nlp.Hashtags',\n", - " 'validmind.data_validation.nlp.LanguageDetection',\n", - " 'validmind.data_validation.nlp.Mentions',\n", - " 'validmind.data_validation.nlp.TextDescription',\n", - " 'validmind.data_validation.nlp.StopWords']" + " 'validmind.model_validation.ragas.ContextPrecisionWithoutReference',\n", + " 'validmind.model_validation.ragas.ContextRecall',\n", + " 'validmind.model_validation.ragas.Faithfulness',\n", + " 'validmind.model_validation.ragas.NoiseSensitivity',\n", + " 'validmind.model_validation.ragas.ResponseRelevancy',\n", + " 'validmind.model_validation.ragas.SemanticSimilarity',\n", + 
" 'validmind.prompt_validation.Bias',\n", + " 'validmind.prompt_validation.Clarity',\n", + " 'validmind.prompt_validation.Conciseness',\n", + " 'validmind.prompt_validation.Delimitation',\n", + " 'validmind.prompt_validation.NegativeInstruction',\n", + " 'validmind.prompt_validation.Robustness',\n", + " 'validmind.prompt_validation.Specificity']" ] }, "execution_count": 10, @@ -2527,12 +3616,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "571210f026b14522a043157e2c9b708e", + "model_id": "5025f3a7dbb34f4c9de1b26e4909f3f7", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Accordion(children=(HTML(value='\\n
… Overfit Diagnosis … Detects and visualizes overfit reg…" (truncated ipywidgets Accordion reprs for the old and new outputs; the rendered widget markup was stripped during extraction)
For access to all features available in this notebook, create a free ValidMind account.\n", "
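The truncated Accordion repr above is the interactive widget a notebook renders when asked to describe a single test. A minimal sketch, assuming `vm.tests.describe_test` is the call behind it (the function name is an assumption, not shown in this diff):

```python
import validmind as vm

# Renders an expandable widget with the test's name, description, required
# inputs, and parameters -- e.g. the "Overfit Diagnosis" accordion above.
vm.tests.describe_test("validmind.model_validation.sklearn.OverfitDiagnosis")
```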

\n", diff --git a/site/notebooks/how_to/log_metrics_over_time.ipynb b/site/notebooks/how_to/log_metrics_over_time.ipynb index d551d58ff1..bbf4b94cbc 100644 --- a/site/notebooks/how_to/log_metrics_over_time.ipynb +++ b/site/notebooks/how_to/log_metrics_over_time.ipynb @@ -78,7 +78,7 @@ "\n", "\n", "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "
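For context on what this notebook teaches, a hedged sketch of logging a metric value over time (the `log_metric` import path and its `recorded_at` parameter are assumptions inferred from the notebook's title, not shown in this hunk):

```python
from datetime import datetime, timezone

from validmind import log_metric  # assumed import path

# Each call records one named value; repeated calls with different
# timestamps build the history shown under "Metrics over time".
log_metric(
    key="AUC_score",
    value=0.85,
    recorded_at=datetime.now(timezone.utc).isoformat(),  # assumed parameter
)
```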

\n", diff --git a/site/notebooks/how_to/run_documentation_sections.ipynb b/site/notebooks/how_to/run_documentation_sections.ipynb index b7b43e3793..9066d637f1 100644 --- a/site/notebooks/how_to/run_documentation_sections.ipynb +++ b/site/notebooks/how_to/run_documentation_sections.ipynb @@ -73,7 +73,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/how_to/run_documentation_tests_with_config.ipynb b/site/notebooks/how_to/run_documentation_tests_with_config.ipynb index 0eea64a465..8ad291fe80 100644 --- a/site/notebooks/how_to/run_documentation_tests_with_config.ipynb +++ b/site/notebooks/how_to/run_documentation_tests_with_config.ipynb @@ -77,7 +77,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "
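As a rough sketch of what this notebook covers, running documentation tests with overridden parameters (the config schema keyed by test ID is an assumption modeled on the parameter listings earlier in this diff):

```python
import validmind as vm

# Override default test parameters for a documentation run by keying a
# config dict on test IDs (schema assumed).
config = {
    "validmind.data_validation.ClassImbalance": {
        "params": {"min_percent_threshold": 10},
    },
    "validmind.model_validation.sklearn.MinimumAccuracy": {
        "params": {"min_threshold": 0.75},
    },
}

vm.run_documentation_tests(config=config)
```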

\n", diff --git a/site/notebooks/how_to/run_tests/1_run_dataset_based_tests.ipynb b/site/notebooks/how_to/run_tests/1_run_dataset_based_tests.ipynb index dfbc4a0de7..acfb923f82 100644 --- a/site/notebooks/how_to/run_tests/1_run_dataset_based_tests.ipynb +++ b/site/notebooks/how_to/run_tests/1_run_dataset_based_tests.ipynb @@ -72,7 +72,7 @@ "\n", "\n", "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "
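A minimal sketch of running one dataset-based test from the listings above (the CSV path, input_id, and target column are illustrative assumptions):

```python
import pandas as pd
import validmind as vm

df = pd.read_csv("bank_customer_churn.csv")  # hypothetical dataset file

# Wrap the raw DataFrame so tests can consume it.
vm_raw_ds = vm.init_dataset(
    dataset=df,
    input_id="raw_dataset",
    target_column="Exited",  # assumed target column
)

# Run a single test that only requires a dataset, then log the result.
result = vm.tests.run_test(
    "validmind.data_validation.ClassImbalance",
    inputs={"dataset": vm_raw_ds},
    params={"min_percent_threshold": 10},
)
result.log()
```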

\n", diff --git a/site/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb b/site/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb index 9370fc98be..ab8880c4c8 100644 --- a/site/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb +++ b/site/notebooks/how_to/run_tests/2_run_comparison_tests.ipynb @@ -79,7 +79,7 @@ "\n", "\n", "### New to ValidMind?\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "
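For orientation, a hedged sketch of a comparison test: the same test runs across every combination in a grid of inputs and the outputs are merged side by side (the `input_grid` argument and the input IDs are assumptions; the datasets and model are presumed registered earlier via `vm.init_dataset` / `vm.init_model`):

```python
import validmind as vm

# Compare the ROC curve across the training and test splits in one result.
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.ROCCurve",
    input_grid={
        "dataset": ["train_dataset", "test_dataset"],  # assumed input_ids
        "model": ["churn_model"],
    },
)
result.log()
```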

\n", diff --git a/site/notebooks/how_to/run_tests_that_require_multiple_datasets.ipynb b/site/notebooks/how_to/run_tests_that_require_multiple_datasets.ipynb index 9fde462201..184594bf2b 100644 --- a/site/notebooks/how_to/run_tests_that_require_multiple_datasets.ipynb +++ b/site/notebooks/how_to/run_tests_that_require_multiple_datasets.ipynb @@ -75,7 +75,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "
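A minimal sketch of the multi-dataset pattern this notebook covers: tests whose Required Inputs list "datasets" (plural) take a list of dataset objects plus the model (`vm_train_ds`, `vm_test_ds`, and `vm_model` are assumed to come from `vm.init_dataset()` / `vm.init_model()` earlier in the notebook):

```python
import validmind as vm

# Typically the train and test splits are passed together so the test can
# compare performance between them.
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.TrainingTestDegradation",
    inputs={
        "datasets": [vm_train_ds, vm_test_ds],
        "model": vm_model,
    },
)
result.log()
```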

\n", diff --git a/site/notebooks/how_to/run_unit_metrics.ipynb b/site/notebooks/how_to/run_unit_metrics.ipynb index c6469e7ac9..163306ae3e 100644 --- a/site/notebooks/how_to/run_unit_metrics.ipynb +++ b/site/notebooks/how_to/run_unit_metrics.ipynb @@ -104,7 +104,7 @@ "\n", "### New to ValidMind? \n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "
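The unit_metrics rows in the listing above are single-value metrics that, judging by their Required Inputs columns, can be invoked like any other test. A hedged sketch (`vm_model` and `vm_test_ds` are assumed ValidMind objects):

```python
import validmind as vm

# Compute one scalar metric -- here the F1 score -- and log it.
result = vm.tests.run_test(
    "validmind.unit_metrics.classification.F1",
    inputs={"model": vm_model, "dataset": vm_test_ds},
)
result.log()
```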

\n", diff --git a/site/notebooks/how_to/use_dataset_model_objects.ipynb b/site/notebooks/how_to/use_dataset_model_objects.ipynb index 44ef1b151b..abf449cb03 100644 --- a/site/notebooks/how_to/use_dataset_model_objects.ipynb +++ b/site/notebooks/how_to/use_dataset_model_objects.ipynb @@ -79,7 +79,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "
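For context, a minimal sketch of the dataset and model objects this notebook is about (`train_df` and the "Exited" target column are illustrative assumptions):

```python
import validmind as vm
import xgboost as xgb

# Wrap the data in a ValidMind dataset object so tests can reference it.
vm_train_ds = vm.init_dataset(
    dataset=train_df,
    input_id="train_dataset",
    target_column="Exited",
)

# Train any supported model, then wrap it the same way.
model = xgb.XGBClassifier()
model.fit(train_df.drop(columns=["Exited"]), train_df["Exited"])
vm_model = vm.init_model(model, input_id="churn_model")

# Attach the model's predictions to the dataset so performance tests can
# line up y_true and y_pred (method name as in current releases, assumed).
vm_train_ds.assign_predictions(model=vm_model)
```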

\n", diff --git a/site/notebooks/quickstart_customer_churn_full_suite.ipynb b/site/notebooks/quickstart_customer_churn_full_suite.ipynb index 21c1aa17e8..d7a9c6b878 100644 --- a/site/notebooks/quickstart_customer_churn_full_suite.ipynb +++ b/site/notebooks/quickstart_customer_churn_full_suite.ipynb @@ -75,7 +75,7 @@ "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you explore the available resources for developers at some point. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", @@ -545,9 +545,9 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "Python 3", + "display_name": "ValidMind Library", "language": "python", - "name": "python3" + "name": "validmind" }, "language_info": { "codemirror_mode": { diff --git a/site/notebooks/templates/about-validmind.ipynb b/site/notebooks/templates/about-validmind.ipynb index 1c135f2683..e982e60d01 100644 --- a/site/notebooks/templates/about-validmind.ipynb +++ b/site/notebooks/templates/about-validmind.ipynb @@ -31,7 +31,7 @@ "source": [ "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", diff --git a/site/notebooks/tutorials/model_development/101-set_up_validmind.ipynb b/site/notebooks/tutorials/model_development/1-set_up_validmind.ipynb similarity index 90% rename from site/notebooks/tutorials/model_development/101-set_up_validmind.ipynb rename to site/notebooks/tutorials/model_development/1-set_up_validmind.ipynb index c7d00ecd2b..46a002a83d 100644 --- a/site/notebooks/tutorials/model_development/101-set_up_validmind.ipynb +++ b/site/notebooks/tutorials/model_development/1-set_up_validmind.ipynb @@ -2,10 +2,10 @@ "cells": [ { "cell_type": "markdown", - "id": "97710f2a", + "id": "b6fa2ac0", "metadata": {}, "source": [ - "# ValidMind for model development — 101 Set up the ValidMind Library\n", + "# ValidMind for model development 1 — Set up the ValidMind Library\n", "\n", "Learn how to use ValidMind for your end-to-end model documentation process based on common model development scenarios with our series of four introductory notebooks. This first notebook walks you through the initial setup of the ValidMind Library.\n", "\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "d3bb0ff8", + "id": "fe2e0eca", "metadata": {}, "source": [ "::: {.content-hidden when-format=\"html\"}\n", @@ -30,6 +30,7 @@ " - [Get your code snippet](#toc3_2_1_) \n", "- [Getting to know ValidMind](#toc4_) \n", " - [Preview the documentation template](#toc4_1_) \n", + " - [View model documentation in the ValidMind Platform](#toc4_1_1_) \n", " - [Explore available tests](#toc4_2_) \n", "- [Upgrade ValidMind](#toc5_) \n", "- [In summary](#toc6_) \n", @@ -49,7 +50,7 @@ }, { "cell_type": "markdown", - "id": "d78e3887", + "id": "814da22c", "metadata": {}, "source": [ "\n", @@ -66,7 +67,7 @@ }, { "cell_type": "markdown", - "id": "f40a5e0a", + "id": "4b966a95", "metadata": {}, "source": [ "\n", @@ -80,7 +81,7 @@ }, { "cell_type": "markdown", - "id": "12af6ba2", + "id": "87936431", "metadata": {}, "source": [ "\n", @@ -94,14 +95,14 @@ }, { "cell_type": "markdown", - "id": "5f9cc87c", + "id": "cb9f8dc1", "metadata": {}, "source": [ "\n", "\n", "### New to ValidMind?\n", "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", "\n", "
For access to all features available in this notebook, create a free ValidMind account.\n", "

\n", @@ -110,7 +111,7 @@ }, { "cell_type": "markdown", - "id": "31c5cde0", + "id": "a0d16aca", "metadata": {}, "source": [ "\n", @@ -145,7 +146,7 @@ }, { "cell_type": "markdown", - "id": "1c06378f", + "id": "215d62a7", "metadata": {}, "source": [ "\n", @@ -157,7 +158,6 @@ }, { "cell_type": "markdown", - "id": "00f99235", "metadata": {}, "source": [ "\n", @@ -174,7 +174,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8883bbc3", + "id": "827eb6bd", "metadata": {}, "outputs": [], "source": [ @@ -183,7 +183,7 @@ }, { "cell_type": "markdown", - "id": "780b6b39", + "id": "5e37f9fe", "metadata": {}, "source": [ "\n", @@ -195,7 +195,6 @@ }, { "cell_type": "markdown", - "id": "ec5bdcec", "metadata": {}, "source": [ "\n", @@ -213,7 +212,7 @@ }, { "cell_type": "markdown", - "id": "d00f6f07", + "id": "48eb92b3", "metadata": {}, "source": [ " - Documentation template: `Binary classification`\n", @@ -235,7 +234,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5f22e91d", + "id": "a58d951f", "metadata": {}, "outputs": [], "source": [ @@ -258,7 +257,7 @@ }, { "cell_type": "markdown", - "id": "c3186121", + "id": "99cf2df8", "metadata": {}, "source": [ "\n", @@ -268,7 +267,6 @@ }, { "cell_type": "markdown", - "id": "3b4c604d", "metadata": {}, "source": [ "\n", @@ -283,7 +281,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32ab4cac", + "id": "819a40bc", "metadata": {}, "outputs": [], "source": [ @@ -292,7 +290,24 @@ }, { "cell_type": "markdown", - "id": "6f5341af", + "id": "cf63d701", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### View model documentation in the ValidMind Platform\n", + "\n", + "Next, let's head to the ValidMind Platform to see the template in action:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and select the model you registered for this \"ValidMind for model development\" series of notebooks.\n", + "\n", + "3. Click on the **Documentation** for your model and note how the structure of the documentation matches our preview above." + ] + }, + { + "cell_type": "markdown", "metadata": {}, "source": [ "\n", @@ -307,7 +322,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acf76128", + "id": "7ccc7776", "metadata": {}, "outputs": [], "source": [ @@ -316,7 +331,6 @@ }, { "cell_type": "markdown", - "id": "4327631b", "metadata": {}, "source": [ "\n", @@ -331,7 +345,7 @@ { "cell_type": "code", "execution_count": null, - "id": "95bede03", + "id": "f5d3216d", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +366,7 @@ }, { "cell_type": "markdown", - "id": "6a7bf101", + "id": "9b8aa1cc", "metadata": {}, "source": [ "You may need to restart your kernel after running the upgrade package for changes to be applied." 
@@ -360,7 +374,7 @@ }, { "cell_type": "markdown", - "id": "207875f2", + "id": "65ece5fb", "metadata": {}, "source": [ "\n", @@ -369,15 +383,15 @@ "\n", "In this first notebook, you learned how to:\n", "\n", - "- [ ] Register a model within the ValidMind Platform\n", - "- [ ] Install and initialize the ValidMind Library\n", - "- [ ] Preview the documentation template for your model\n", - "- [ ] Explore the available tests offered by the ValidMind Library" + "- [x] Register a model within the ValidMind Platform\n", + "- [x] Install and initialize the ValidMind Library\n", + "- [x] Preview the documentation template for your model\n", + "- [x] Explore the available tests offered by the ValidMind Library" ] }, { "cell_type": "markdown", - "id": "29781eb4", + "id": "a262f940", "metadata": {}, "source": [ "\n", @@ -387,14 +401,13 @@ }, { "cell_type": "markdown", - "id": "4eb45e03", "metadata": {}, "source": [ "\n", "\n", "### Start the model development process\n", "\n", - "Now that the ValidMind Library is connected to your model in the ValidMind Library with the correct template applied, we can go ahead and start the model development process: **[102 Start the model development process](102-start_development_process.ipynb)**" + "Now that the ValidMind Library is connected to your model in the ValidMind Platform with the correct template applied, we can go ahead and start the model development process: **[2 — Start the model development process](2-start_development_process.ipynb)**" ] } ], diff --git a/site/notebooks/tutorials/model_development/102-start_development_process.ipynb b/site/notebooks/tutorials/model_development/2-start_development_process.ipynb similarity index 87% rename from site/notebooks/tutorials/model_development/102-start_development_process.ipynb rename to site/notebooks/tutorials/model_development/2-start_development_process.ipynb index b64c31426a..74bec6960a 100644 --- a/site/notebooks/tutorials/model_development/102-start_development_process.ipynb +++ b/site/notebooks/tutorials/model_development/2-start_development_process.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ValidMind for model development — 102 Start the model development process\n", + "# ValidMind for model development 2 — Start the model development process\n", "\n", "Learn how to use ValidMind for your end-to-end model documentation process with our series of four introductory notebooks. 
In this second notebook, you'll run tests and investigate results, then add the results or evidence to your documentation.\n", "\n", @@ -28,7 +28,7 @@ "- [Running tests](#toc3_) \n", " - [Run tabular data tests](#toc3_1_) \n", " - [Utilize test output](#toc3_2_) \n", - "- [Documenting results](#toc4_) \n", + "- [Documenting test results](#toc4_) \n", " - [Run and log multiple tests](#toc4_1_) \n", " - [Run and log an individual test](#toc4_2_) \n", " - [Add individual test results to model documentation](#toc4_2_1_) \n", @@ -62,12 +62,12 @@ "\n", "In order to log test results or evidence to your model documentation with this notebook, you'll need to first have:\n", "\n", - "- [ ] Registered a model within the ValidMind Platform with a predefined documentation template\n", - "- [ ] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", + "- [x] Registered a model within the ValidMind Platform with a predefined documentation template\n", + "- [x] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", "\n", "
Need help with the above steps?\n", "

\n", - "Refer to the first notebook in this series: 101 Set up ValidMind
\n" + "Refer to the first notebook in this series: 1 — Set up the ValidMind Library
" ] }, { @@ -167,7 +167,10 @@ "\n", "Next, let's say we want to do some data quality assessments by running a few individual tests.\n", "\n", - "Use the [`vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) introduced by the first notebook in this series in combination with [`vm.tests.list_tags()`](https://docs.validmind.ai/validmind/validmind/tests.html#list_tags) and [`vm.tests.list_tasks()`](https://docs.validmind.ai/validmind/validmind/tests.html#list_tasks) to find which prebuilt tests are relevant for data quality assessment:\n" + "Use the [`vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) introduced by the first notebook in this series in combination with [`vm.tests.list_tags()`](https://docs.validmind.ai/validmind/validmind/tests.html#list_tags) and [`vm.tests.list_tasks()`](https://docs.validmind.ai/validmind/validmind/tests.html#list_tasks) to find which prebuilt tests are relevant for data quality assessment:\n", + "\n", + "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `classification` tasks.\n", + "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `data_quality` tag.\n" ] }, { @@ -176,8 +179,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Get the list of available tags\n", - "sorted(vm.tests.list_tags())" + "# Get the list of available task types\n", + "sorted(vm.tests.list_tasks())" ] }, { @@ -186,8 +189,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Get the list of available task types\n", - "sorted(vm.tests.list_tasks())" + "# Get the list of available tags\n", + "sorted(vm.tests.list_tags())" ] }, { @@ -208,6 +211,15 @@ "vm.tests.list_tests(task=\"classification\", tags=[\"tabular_data\", \"data_quality\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Want to learn more about navigating ValidMind tests?\n", + "

\n", + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -252,11 +264,7 @@ "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module. For the examples below, we'll pass in the following arguments:\n", "\n", "- **`test_id`** — The ID of the test to run, as seen in the `ID` column when you run `list_tests`. \n", - "- **`params`** — A dictionary of parameters for the test. These will override any `default_params` set in the test definition. \n", - "\n", - "
Want to learn more about ValidMind tests?\n", - "

\n", - "Refer to our notebook that includes code samples and usage of key functions: Explore tests
" + "- **`params`** — A dictionary of parameters for the test. These will override any `default_params` set in the test definition. " ] }, { @@ -386,7 +394,9 @@ "\n", "### Utilize test output\n", "\n", - "You can utilize the output from a ValidMind test for further use, for example, if you want to remove highly correlated features. Below we demonstrate how to retrieve the list of features with the highest correlation coefficients and use them to reduce the final list of features for modeling.\n", + "You can utilize the output from a ValidMind test for further use, for example, if you want to remove highly correlated features. Removing highly correlated features helps make the model simpler, more stable, and easier to understand.\n", + "\n", + "Below we demonstrate how to retrieve the list of features with the highest correlation coefficients and use them to reduce the final list of features for modeling.\n", "\n", "First, we'll run [`validmind.data_validation.HighPearsonCorrelation`](https://docs.validmind.ai/tests/data_validation/HighPearsonCorrelation.html) with the `balanced_raw_dataset` we initialized previously as input as is for comparison with later runs:" ] @@ -410,6 +420,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "The output above shows that the test did not pass according to the value we set for `max_threshold`.\n", + "\n", "`corr_result` is an object of type `TestResult`. We can inspect the result object to see what the test has produced:" ] }, @@ -512,7 +524,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Re-running the test with the reduced feature set should pass the test:\n" + "Re-running the test with the reduced feature set should pass the test:" ] }, { @@ -555,9 +567,9 @@ "source": [ "\n", "\n", - "## Documenting results\n", + "## Documenting test results\n", "\n", - "We've now done some analysis on two different datasets, and we should be able to document why certain things were done to the raw data with testing to support it.\n", + "Now that we've done some analysis on two different datasets, we can use ValidMind to easily document why certain things were done to our raw data with testing to support it.\n", "\n", "Every test result returned by the `run_test()` function has a [`.log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#TestResult.log) that can be used to send the test results to the ValidMind Platform:\n", "\n", @@ -624,7 +636,7 @@ "\n", "### Run and log an individual test\n", "\n", - "Next, we'll use the previously initialized `vm_balanced_raw_dataset` (that had a highly correlated `Age` column) as input to run an individual test, then log the result to the ValidMind Platform.\n", + "Next, we'll use the previously initialized `vm_balanced_raw_dataset` (that still has a highly correlated `Age` column) as input to run an individual test, then log the result to the ValidMind Platform.\n", "\n", "When running individual tests, **you can use a custom `result_id` to tag the individual result with a unique identifier:** \n", "\n", @@ -669,7 +681,7 @@ "\n", "2. In the left sidebar that appears for your model, click **Documentation**.\n", "\n", - "3. Locate the Data Preparation section and click on **2.3 Correlations and Interactions** to expand that section.\n", + "3. Locate the Data Preparation section and click on **2.3. Correlations and Interactions** to expand that section.\n", "\n", "4. 
Hover under the Pearson Correlation Matrix content block until a horizontal dashed line with a **+** button appears, indicating that you can insert a new block.\n", "\n", @@ -688,7 +700,7 @@ "\n", "6. Finally, click **Insert 1 Test Result to Document** to add the test result to the documentation.\n", "\n", - " Confirm that the individual results for the high correlation test has been correctly inserted into section **2.3 Correlations and Interactions** of the documentation.\n", + " Confirm that the individual results for the high correlation test has been correctly inserted into section **2.3. Correlations and Interactions** of the documentation.\n", "\n", "7. Finalize the documentation by editing the test result's description block to explain the changes you made to the raw data and the reasons behind them as shown in the screenshot below:\n", "\n", @@ -758,7 +770,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using `GridSearchCV`, we'll find the best-performing hyperparameters or settings and save them:" + "We'll split our preprocessed dataset into training and testing, to help assess how well the model generalizes to unseen data:\n", + "\n", + "- We start by dividing our `balanced_raw_no_age_df` dataset into training and test subsets using `train_test_split`, with 80% of the data allocated to training (`train_df`) and 20% to testing (`test_df`).\n", + "- From each subset, we separate the features (all columns except \"Exited\") into `X_train` and `X_test`, and the target column (\"Exited\") into `y_train` and `y_test`." ] }, { @@ -767,18 +782,30 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", "\n", - "# Split the input and target variables\n", - "X = balanced_raw_no_age_df.drop(\"Exited\", axis=1)\n", - "y = balanced_raw_no_age_df[\"Exited\"]\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X,\n", - " y,\n", - " test_size=0.2,\n", - " random_state=42,\n", - ")\n", + "train_df, test_df = train_test_split(balanced_raw_no_age_df, test_size=0.20)\n", + "\n", + "X_train = train_df.drop(\"Exited\", axis=1)\n", + "y_train = train_df[\"Exited\"]\n", + "X_test = test_df.drop(\"Exited\", axis=1)\n", + "y_test = test_df[\"Exited\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then using `GridSearchCV`, we'll find the best-performing hyperparameters or settings and save them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", "\n", "# Logistic Regression grid params\n", "log_reg_params = {\n", @@ -805,9 +832,7 @@ "\n", "### Initialize model evaluation objects\n", "\n", - "The last step for evaluating the model's performance is to initialize the ValidMind `Dataset` and `Model` objects in preparation for assigning model predictions to each dataset.\n", - "\n", - "Use the `init_dataset` and [`init_model`](https://docs.validmind.ai/validmind/validmind.html#init_model) functions to initialize these objects:\n" + "The last step for evaluating the model's performance is to initialize the ValidMind `Dataset` and `Model` objects in preparation for assigning model predictions to each dataset." 
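The `log_reg_params` grid in the cell above is cut off by the hunk boundary, so its actual values are not visible here. A minimal sketch of the `GridSearchCV` step described above, with an illustrative grid standing in for the real one:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Illustrative grid only; the notebook's full log_reg_params are not shown
log_reg_params = {
    "penalty": ["l2"],
    "C": [0.01, 0.1, 1, 10],
}

# Find the best-performing hyperparameters on the training subset
grid_log_reg = GridSearchCV(LogisticRegression(max_iter=1000), log_reg_params)
grid_log_reg.fit(X_train, y_train)

# Keep the best estimator; this is the `log_reg` model that gets
# registered with vm.init_model() in the cells that follow
log_reg = grid_log_reg.best_estimator_
```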
] }, { @@ -816,11 +841,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_df = X_train\n", - "train_df[\"Exited\"] = y_train\n", - "test_df = X_test\n", - "test_df[\"Exited\"] = y_test\n", - "\n", + "# Initialize the datasets into their own dataset objects\n", "vm_train_ds = vm.init_dataset(\n", " input_id=\"train_dataset_final\",\n", " dataset=train_df,\n", @@ -831,8 +852,24 @@ " input_id=\"test_dataset_final\",\n", " dataset=test_df,\n", " target_column=\"Exited\",\n", - ")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You'll also need to initialize a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data for each of our three models.\n", "\n", + "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Register the model\n", "vm_model = vm.init_model(log_reg, input_id=\"log_reg_model_v1\")" ] @@ -845,7 +882,10 @@ "\n", "### Assign predictions\n", "\n", - "Once the model has been registered you can assign model predictions to the training and test datasets. The [`assign_predictions()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#VMDataset.assign_predictions) from the `Dataset` object can link existing predictions to any number of models.\n", + "Once the model has been registered you can assign model predictions to the training and test datasets.\n", + "\n", + "- The [`assign_predictions()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#assign_predictions) from the `Dataset` object can link existing predictions to any number of models.\n", + "- This method links the model's class prediction values and probabilities to our `vm_train_ds` and `vm_test_ds` datasets.\n", "\n", "If no prediction values are passed, the method will compute predictions automatically:\n" ] @@ -912,14 +952,14 @@ "\n", "In this second notebook, you learned how to:\n", "\n", - "- [ ] Import a sample dataset\n", - "- [ ] Identify which tests you might want to run with ValidMind\n", - "- [ ] Initialize ValidMind datasets\n", - "- [ ] Run individual tests\n", - "- [ ] Utilize the output from tests you've run\n", - "- [ ] Log test results from sets of or individual tests as evidence to the ValidMind Platform\n", - "- [ ] Add supplementary individual test results to your documentation\n", - "- [ ] Assign model predictions to your ValidMind datasets\n" + "- [x] Import a sample dataset\n", + "- [x] Identify which tests you might want to run with ValidMind\n", + "- [x] Initialize ValidMind datasets\n", + "- [x] Run individual tests\n", + "- [x] Utilize the output from tests you've run\n", + "- [x] Log test results from sets of or individual tests as evidence to the ValidMind Platform\n", + "- [x] Add supplementary individual test results to your documentation\n", + "- [x] Assign model predictions to your ValidMind model objects\n" ] }, { @@ -939,7 +979,7 @@ "\n", "### Integrate custom tests\n", "\n", - "Now that you're familiar with the basics of using the ValidMind Library to run and log tests to provide evidence for your model documentation, let's learn how to incorporate your own custom tests into ValidMind: **[103 Integrate custom tests](103-integrate_custom_tests.ipynb)**" + "Now that you're familiar with the basics of using the ValidMind Library to run and log tests to provide evidence for 
your model documentation, let's learn how to incorporate your own custom tests into ValidMind: **[3 — Integrate custom tests](3-integrate_custom_tests.ipynb)**" ] } ], diff --git a/site/notebooks/tutorials/model_development/103-integrate_custom_tests.ipynb b/site/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb similarity index 92% rename from site/notebooks/tutorials/model_development/103-integrate_custom_tests.ipynb rename to site/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb index bcde58c62c..038f45c38a 100644 --- a/site/notebooks/tutorials/model_development/103-integrate_custom_tests.ipynb +++ b/site/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ValidMind for model development — 103 Integrate custom tests\n", + "# ValidMind for model development 3 — Integrate custom tests\n", "\n", "Learn how to use ValidMind for your end-to-end model documentation process with our series of four introductory notebooks. In this third notebook, supplement ValidMind tests with your own and include them as additional evidence in your documentation.\n", "\n", @@ -65,22 +65,20 @@ "\n", "## Prerequisites\n", "\n", - "In order to log test results or evidence to your model documentation with this notebook, you'll need to first have:\n", + "In order to integrate custom tests with your model documentation with this notebook, you'll need to first have:\n", "\n", - "- [ ] Registered a model within the ValidMind Platform with a predefined documentation template\n", - "- [ ] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", - "- [ ] Learned how to import and initialize datasets for use with ValidMind\n", - "- [ ] Understood the basics of how to run and log tests with ValidMind\n", - "- [ ] Inserted a test-driven block for the results of your `HighPearsonCorrelation:balanced_raw_dataset` test into your model's documentation\n", + "- [x] Registered a model within the ValidMind Platform with a predefined documentation template\n", + "- [x] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", + "- [x] Learned how to import and initialize datasets for use with ValidMind\n", + "- [x] Understood the basics of how to run and log tests with ValidMind\n", + "- [x] Inserted a test-driven block for the results of your `HighPearsonCorrelation:balanced_raw_dataset` test into your model's documentation\n", "\n", "
Need help with the above steps?\n", "

\n", "Refer to the first two notebooks in this series:\n", "\n", - "
    \n", - "
  1. 101 Set up ValidMind
  2. \n", - "
  3. 102 Start the model development process
  4. \n", - "
\n", + "- 1 — Set up the ValidMind Library\n", + "- 2 — Start the model development process\n", "\n", "
\n" ] @@ -93,7 +91,7 @@ "\n", "## Setting up\n", "\n", - "This section should be quite familiar to you — as we performed the same actions in the previous notebook, **[102 Start the model development process](102-start_development_process.ipynb)**." + "This section should be quite familiar to you — as we performed the same actions in the previous notebook, **[2 — Start the model development process](2-start_development_process.ipynb)**." ] }, { @@ -342,18 +340,24 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LogisticRegression\n", + "# Split the processed dataset into train and test\n", "from sklearn.model_selection import train_test_split\n", "\n", - "# Split the input and target variables\n", - "X = balanced_raw_no_age_df.drop(\"Exited\", axis=1)\n", - "y = balanced_raw_no_age_df[\"Exited\"]\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X,\n", - " y,\n", - " test_size=0.2,\n", - " random_state=42,\n", - ")\n", + "train_df, test_df = train_test_split(balanced_raw_no_age_df, test_size=0.20)\n", + "\n", + "X_train = train_df.drop(\"Exited\", axis=1)\n", + "y_train = train_df[\"Exited\"]\n", + "X_test = test_df.drop(\"Exited\", axis=1)\n", + "y_test = test_df[\"Exited\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", "\n", "# Logistic Regression grid params\n", "log_reg_params = {\n", @@ -389,11 +393,6 @@ "metadata": {}, "outputs": [], "source": [ - "train_df = X_train\n", - "train_df[\"Exited\"] = y_train\n", - "test_df = X_test\n", - "test_df[\"Exited\"] = y_test\n", - "\n", "# Initialize the datasets into their own dataset objects\n", "vm_train_ds = vm.init_dataset(\n", " input_id=\"train_dataset_final\",\n", @@ -627,7 +626,7 @@ "- Since these are `VMDataset` or `VMModel` inputs, they have a special meaning.\n", "- When declaring a `dataset`, `model`, `datasets` or `models` argument in a custom test function, the ValidMind Library will expect these get passed as `inputs` to `run_test()` or `run_documentation_tests()`.\n", "\n", - "Re-running the confusion matrix with `normalize=True` looks like this:\n" + "Re-running the confusion matrix with `normalize=True` and our testing dataset looks like this:\n" ] }, { @@ -640,7 +639,7 @@ "result = vm.tests.run_test(\n", " \"my_custom_tests.ConfusionMatrix:test_dataset_normalized\",\n", " inputs={\"model\": vm_model, \"dataset\": vm_test_ds},\n", - " params={\"normalize\": True},\n", + " params={\"normalize\": True}\n", ")" ] }, @@ -652,7 +651,7 @@ "\n", "### Log the confusion matrix results\n", "\n", - "As we learned in **[102 Start the model development process](102-start_development_process.ipynb)** under **Documenting results** > **Run and log an individual tests**, you can log any result to the ValidMind Platform with the [`.log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#TestResult.log) of the result object, allowing you to then add the result to the documentation.\n", + "As we learned in **[2 — Start the model development process](2-start_development_process.ipynb)** under **Documenting results** > **Run and log an individual tests**, you can log any result to the ValidMind Platform with the [`.log()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#TestResult.log) of the result object, allowing you to then add the result to the documentation.\n", "\n", "You can now do the same for the confusion matrix results:\n" ] 
@@ -735,9 +734,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After running the command above, confirm that the new `my_tests` directory was created successfully:\n", + "After running the command above, confirm that a new `my_tests` directory was created successfully. For example:\n", "\n", - "\"Screenshot" + "```\n", + "~/notebooks/tutorials/model_development/my_tests/\n", + "```" ] }, { @@ -781,8 +782,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "- [ ] Confirm that the `save()` method saved the `confusion_matrix` function to a file named `ConfusionMatrix.py` in the `my_tests` folder.\n", - "- [ ] Note that the new file provides some context on the origin of the test, which is useful for traceability:\n", + "- [x] Confirm that the `save()` method saved the `confusion_matrix` function to a file named `ConfusionMatrix.py` in the `my_tests` folder.\n", + "- [x] Note that the new file provides some context on the origin of the test, which is useful for traceability:\n", "\n", " ```\n", " # Saved from __main__.confusion_matrix\n", @@ -790,7 +791,7 @@ " # New Test ID: .ConfusionMatrix\n", " ```\n", "\n", - "- [ ] Additionally, the new test function has been stripped off its decorator, as it now resides in a file that will be loaded by the test provider:\n", + "- [x] Additionally, the new test function has been stripped off its decorator, as it now resides in a file that will be loaded by the test provider:\n", "\n", " ```python\n", " def ConfusionMatrix(dataset, model, normalize=False):\n", @@ -808,7 +809,7 @@ "Now that your `my_tests` folder has a sample custom test, let's initialize a test provider that will tell the ValidMind Library where to find your custom tests:\n", "\n", "- ValidMind offers out-of-the-box test providers for local tests (tests in a folder) or a Github provider for tests in a Github repository.\n", - "- You can also create your own test provider by creating a class that has a [`load_test` method](https://docs.validmind.ai/validmind/validmind/tests.html#TestProvider.load_test) that takes a test ID and returns the test function matching that ID.\n", + "- You can also create your own test provider by creating a class that has a [`load_test` method](https://docs.validmind.ai/validmind/validmind/tests.html#load_test) that takes a test ID and returns the test function matching that ID.\n", "\n", "
Want to learn more about test providers?\n", "

\n", @@ -862,7 +863,7 @@ "- For tests that reside in a test provider directory, the test ID will be the `namespace` specified when registering the provider, followed by the path to the test file relative to the tests folder.\n", "- For example, the Confusion Matrix test we created earlier will have the test ID `my_test_provider.ConfusionMatrix`. You could organize the tests in subfolders, say `classification` and `regression`, and the test ID for the Confusion Matrix test would then be `my_test_provider.classification.ConfusionMatrix`.\n", "\n", - "Let's go ahead and re-run the confusion matrix test by using the test ID `my_test_provider.ConfusionMatrix`. This should load the test from the test provider and run it as before.\n" + "Let's go ahead and re-run the confusion matrix test with our testing dataset by using the test ID `my_test_provider.ConfusionMatrix`. This should load the test from the test provider and run it as before.\n" ] }, { @@ -905,7 +906,7 @@ "\n", "2. In the left sidebar that appears for your model, click **Documentation**.\n", "\n", - "3. Locate the Data Preparation section and click on **3.2 Model Evaluation** to expand that section.\n", + "3. Locate the Data Preparation section and click on **3.2. Model Evaluation** to expand that section.\n", "\n", "4. Hover under the Pearson Correlation Matrix content block until a horizontal dashed line with a **+** button appears, indicating that you can insert a new block.\n", "\n", @@ -922,7 +923,7 @@ "\n", "6. Finally, click **Insert 2 Test Results to Document** to add the test results to the documentation.\n", "\n", - " Confirm that the two individual results for the confusion matrix tests have been correctly inserted into section **3.2 Model Evaluation** of the documentation." + " Confirm that the two individual results for the confusion matrix tests have been correctly inserted into section **3.2. Model Evaluation** of the documentation." 
] }, { @@ -935,10 +936,10 @@ "\n", "In this third notebook, you learned how to:\n", "\n", - "- [ ] Implement a custom inline test\n", - "- [ ] Run and log your custom inline tests\n", - "- [ ] Use external custom test providers\n", - "- [ ] Run and log tests from your custom test providers" + "- [x] Implement a custom inline test\n", + "- [x] Run and log your custom inline tests\n", + "- [x] Use external custom test providers\n", + "- [x] Run and log tests from your custom test providers" ] }, { @@ -958,7 +959,7 @@ "\n", "### Finalize testing and documentation\n", "\n", - "Now that you're proficient at using the ValidMind Library to run and log tests, let's put the last pieces in place to prepare our fully documented sample model for review: **[104 Finalize testing and documentation](104-finalize_testing_documentation.ipynb)**" + "Now that you're proficient at using the ValidMind Library to run and log tests, let's put the last pieces in place to prepare our fully documented sample model for review: **[4 — Finalize testing and documentation](4-finalize_testing_documentation.ipynb)**" ] } ], diff --git a/site/notebooks/tutorials/model_development/104-finalize_testing_documentation.ipynb b/site/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb similarity index 94% rename from site/notebooks/tutorials/model_development/104-finalize_testing_documentation.ipynb rename to site/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb index fd17733fcd..13a4f1e148 100644 --- a/site/notebooks/tutorials/model_development/104-finalize_testing_documentation.ipynb +++ b/site/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb @@ -4,11 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ValidMind for model development — 104 Finalize testing and documentation\n", + "# ValidMind for model development 4 — Finalize testing and documentation\n", "\n", "Learn how to use ValidMind for your end-to-end model documentation process with our introductory notebook series. In this last notebook, finalize the testing and documentation of your model and have a fully documented sample model ready for review.\n", "\n", - "We'll first use [`run_documentation_tests()`](https://docs.validmind.ai/validmind/validmind.html#run_documentation_tests) previously covered in **[102 Start the model development process](102-start_development_process.ipynb)** to ensure that your custom test results generated in **[103 Integrate custom tests](103-integrate_custom_tests.ipynb)** are included in your documentation. Then, we'll view and update the configuration for the entire model documentation template to suit your needs.\n" + "We'll first use [`run_documentation_tests()`](https://docs.validmind.ai/validmind/validmind.html#run_documentation_tests) previously covered in **[2 — Start the model development process](2-start_development_process.ipynb)** to ensure that your custom test results generated in **[3 — Integrate custom tests](3-integrate_custom_tests.ipynb)** are included in your documentation. 
Then, we'll view and update the configuration for the entire model documentation template to suit your needs.\n" ] }, { @@ -61,24 +61,22 @@ "\n", "In order to finalize the testing and documentation for your sample model, you'll need to first have:\n", "\n", - "- [ ] Registered a model within the ValidMind Platform with a predefined documentation template\n", - "- [ ] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", - "- [ ] Learned how to import and initialize datasets for use with ValidMind\n", - "- [ ] Learned how to run and log default and custom tests with ValidMind, including from external test providers\n", - "- [ ] Inserted test-driven blocks for the results of the following tests into your model's documentation:\n", - " - [ ] `HighPearsonCorrelation:balanced_raw_dataset`\n", - " - [ ] `my_test_provider.ConfusionMatrix`\n", - " - [ ] `my_custom_tests.ConfusionMatrix:test_dataset_normalized`\n", + "- [x] Registered a model within the ValidMind Platform with a predefined documentation template\n", + "- [x] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", + "- [x] Learned how to import and initialize datasets for use with ValidMind\n", + "- [x] Learned how to run and log default and custom tests with ValidMind, including from external test providers\n", + "- [x] Inserted test-driven blocks for the results of the following tests into your model's documentation:\n", + " - [x] `HighPearsonCorrelation:balanced_raw_dataset`\n", + " - [x] `my_test_provider.ConfusionMatrix`\n", + " - [x] `my_custom_tests.ConfusionMatrix:test_dataset_normalized`\n", "\n", "
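As a preview of the parameter assignment mentioned in the intro above, here is a sketch of the config pattern this notebook builds toward; the section name and config keys are assumptions based on the surrounding text:

```python
import validmind as vm

# Assign inputs and params per test ID; the custom confusion matrix keeps
# the normalize=True parameter used throughout this series
config = {
    "my_test_provider.ConfusionMatrix": {
        "inputs": {"model": vm_model, "dataset": vm_test_ds},
        "params": {"normalize": True},
    },
}

vm.run_documentation_tests(
    section="model_evaluation",
    config=config,
)
```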
Need help with the above steps?\n", "

\n", "Refer to the first three notebooks in this series:\n", "\n", - "
    \n", - "
  1. 101 Set up ValidMind
  2. \n", - "
  3. 102 Start the model development process
  4. \n", - "
  5. 103 Integrate custom tests
  6. \n", - "
\n", + "- 1 — Set up the ValidMind Library\n", + "- 2 — Start the model development process\n", + "- 3 — Integrate custom tests\n", "\n", "
" ] @@ -148,7 +146,7 @@ "\n", "### Import sample dataset\n", "\n", - "Next, we'll import the same public [Bank Customer Churn Prediction](https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction) dataset from Kaggle we used in the last notebook so that we have something to work with:" + "Next, we'll import the same public [Bank Customer Churn Prediction](https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction) dataset from Kaggle we used in the last notebooks so that we have something to work with:" ] }, { @@ -340,18 +338,24 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LogisticRegression\n", + "# Split the processed dataset into train and test\n", "from sklearn.model_selection import train_test_split\n", "\n", - "# Split the input and target variables\n", - "X = balanced_raw_no_age_df.drop(\"Exited\", axis=1)\n", - "y = balanced_raw_no_age_df[\"Exited\"]\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X,\n", - " y,\n", - " test_size=0.2,\n", - " random_state=42,\n", - ")\n", + "train_df, test_df = train_test_split(balanced_raw_no_age_df, test_size=0.20)\n", + "\n", + "X_train = train_df.drop(\"Exited\", axis=1)\n", + "y_train = train_df[\"Exited\"]\n", + "X_test = test_df.drop(\"Exited\", axis=1)\n", + "y_test = test_df[\"Exited\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", "\n", "# Logistic Regression grid params\n", "log_reg_params = {\n", @@ -387,11 +391,6 @@ "metadata": {}, "outputs": [], "source": [ - "train_df = X_train\n", - "train_df[\"Exited\"] = y_train\n", - "test_df = X_test\n", - "test_df[\"Exited\"] = y_test\n", - "\n", "# Initialize the datasets into their own dataset objects\n", "vm_train_ds = vm.init_dataset(\n", " input_id=\"train_dataset_final\",\n", @@ -638,7 +637,7 @@ "\n", "Let's run all tests in the Model Evaluation section of the documentation. Note that we have been running the sample custom confusion matrix with `normalize=True` to demonstrate the ability to provide custom parameters.\n", "\n", - "In the **Run the model evaluation tests** section of **[102 Start the model development process](102-start_development_process.ipynb)**, you learned how to assign inputs to individual tests with [`run_documentation_tests()`](https://docs.validmind.ai/validmind/validmind.html#run_documentation_tests). Assigning parameters is similar, you only need to provide assign a `params` dictionary to a given test ID, `my_test_provider.ConfusionMatrix` in this case.\n" + "In the **Run the model evaluation tests** section of **[2 — Start the model development process](2-start_development_process.ipynb)**, you learned how to assign inputs to individual tests with [`run_documentation_tests()`](https://docs.validmind.ai/validmind/validmind.html#run_documentation_tests). 
Assigning parameters is similar; you only need to assign a `params` dictionary to a given test ID, `my_test_provider.ConfusionMatrix` in this case.\n" ] }, { @@ -864,9 +863,9 @@ "\n", "In this final notebook, you learned how to:\n", "\n", - "- [ ] Refresh the connection from the ValidMind Library to the ValidMind Platform after you've inserted test-driven blocks to your documentation\n", - "- [ ] Include custom test results in your model documentation\n", - "- [ ] View and configure the configuration for your model documentation template\n", + "- [x] Refresh the connection from the ValidMind Library to the ValidMind Platform after you've inserted test-driven blocks to your documentation\n", + "- [x] Include custom test results in your model documentation\n", + "- [x] View and update the configuration for your model documentation template\n", "\n", "With our ValidMind for model development series of notebooks, you learned how to document a model end-to-end with the ValidMind Library by running through some common scenarios in a typical model development setting:\n", "\n", @@ -929,6 +928,7 @@ "#### More how-to guides and code samples\n", "\n", "- [Explore available tests in detail](../../how_to/explore_tests.ipynb)\n", + "- [In-depth guide on running dataset-based tests](../../how_to/run_tests/1_run_dataset_based_tests.ipynb)\n", "- [In-depth guide for implementing custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb)\n", "- [In-depth guide to external test providers](../../code_samples/custom_tests/integrate_external_test_providers.ipynb)\n", "- [Configuring dataset features](../../how_to/configure_dataset_features.ipynb)\n", diff --git a/site/notebooks/tutorials/model_development/my_tests_directory.png b/site/notebooks/tutorials/model_development/my_tests_directory.png deleted file mode 100644 index 47baffe80e..0000000000 Binary files a/site/notebooks/tutorials/model_development/my_tests_directory.png and /dev/null differ diff --git a/site/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb b/site/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb new file mode 100644 index 0000000000..2f85b39d77 --- /dev/null +++ b/site/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb @@ -0,0 +1,451 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a9d0996f", + "metadata": {}, + "source": [ + "# ValidMind for model validation 1 — Set up the ValidMind Library for validation\n", + "\n", + "Learn how to use ValidMind for your end-to-end model validation process based on common scenarios with our series of four introductory notebooks. In this first notebook, set up the ValidMind Library in preparation for validating a champion model.\n", + "\n", + "These notebooks use a binary classification model as an example, but the same principles shown here apply to other model types."
+ ] + }, + { + "cell_type": "markdown", + "id": "c747db34", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [Introduction](#toc1_) \n", + "- [About ValidMind](#toc2_) \n", + " - [Before you begin](#toc2_1_) \n", + " - [New to ValidMind?](#toc2_2_) \n", + " - [Key concepts](#toc2_3_) \n", + "- [Setting up](#toc3_) \n", + " - [Register a sample model](#toc3_1_) \n", + " - [Assign validator credentials](#toc3_1_1_) \n", + " - [Install the ValidMind Library](#toc3_2_) \n", + " - [Initialize the ValidMind Library](#toc3_3_) \n", + " - [Get your code snippet](#toc3_3_1_) \n", + "- [Getting to know ValidMind](#toc4_) \n", + " - [Preview the validation report template](#toc4_1_) \n", + " - [View validation report in the ValidMind Platform](#toc4_1_1_) \n", + " - [Explore available tests](#toc4_2_) \n", + "- [Upgrade ValidMind](#toc5_) \n", + "- [In summary](#toc6_) \n", + "- [Next steps](#toc7_) \n", + " - [Start the model validation process](#toc7_1_) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "f1d4715f", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Introduction\n", + "\n", + "Model validation aims to independently assess the compliance of *champion models* created by model developers with regulatory guidance by conducting thorough testing and analysis, potentially including the use of challenger models to benchmark performance. Assessments, presented in the form of a validation report, typically include *model findings* and recommendations to address those issues.\n", + "\n", + "A *binary classification model* is a type of predictive model used in churn analysis to identify customers who are likely to leave a service or subscription by analyzing various behavioral, transactional, and demographic factors.\n", + "\n", + "- This model helps businesses take proactive measures to retain at-risk customers by offering personalized incentives, improving customer service, or adjusting pricing strategies.\n", + "- Effective validation of a churn prediction model ensures that businesses can accurately identify potential churners, optimize retention efforts, and enhance overall customer satisfaction while minimizing revenue loss." + ] + }, + { + "cell_type": "markdown", + "id": "14c2d80d", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate comparison and other validation tests, and then use the ValidMind Platform to submit compliance assessments of champion models via comprehensive validation reports. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model developers." + ] + }, + { + "cell_type": "markdown", + "id": "151a4ca5", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. 
For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "089c960e", + "metadata": {}, + "source": [ + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "

\n", + "Signing up is FREE — Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "id": "5f307177", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Validation report**: A comprehensive and structured assessment of a model’s development and performance, focusing on verifying its integrity, appropriateness, and alignment with its intended use. It includes analyses of model assumptions, data quality, performance metrics, outcomes of testing procedures, and risk considerations. The validation report supports transparency, regulatory compliance, and informed decision-making by documenting the validator’s independent review and conclusions.\n", + "\n", + "**Validation report template**: Serves as a standardized framework for conducting and documenting model validation activities. It outlines the required sections, recommended analyses, and expected validation tests, ensuring consistency and completeness across validation reports. The template helps guide validators through a systematic review process while promoting comparability and traceability of validation outcomes.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets.\n", + "\n", + "**Metrics**: A subset of tests that do not have thresholds. In the context of this notebook, metrics and tests can be thought of as interchangeable concepts.\n", + "\n", + "**Custom metrics**: Custom metrics are functions that you define to evaluate your model or dataset. These functions can be registered with the ValidMind Library to be used in the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom metric.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom metric. (Learn more: [Run tests with multiple datasets](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html))\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a metric, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom metrics can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures." + ] + }, + { + "cell_type": "markdown", + "id": "c42665b8", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "0faed42c", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Register a sample model\n", + "\n", + "In a usual model lifecycle, a champion model will have been independently registered in your model inventory and submitted to you for validation by your model development team as part of the effective challenge process. 
(**Learn more:** [Submit for approval](https://docs.validmind.ai/guide/model-documentation/submit-for-approval.html))\n", + "\n", + "For this series of notebooks, we'll have you register a dummy model in the ValidMind Platform inventory and assign yourself as the validator to familiarize you with the ValidMind interface and circumvent the need for an existing model:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Binary classification`\n", + " - Use case: `Marketing/Sales - Attrition/Churn Management`\n", + "\n", + " You can fill in other options according to your preference." + ] + }, + { + "cell_type": "markdown", + "id": "0c350e0d", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Assign validator credentials\n", + "\n", + "In order to log tests as a validator instead of as a developer, on the model details page that appears after you've successfully registered your sample model:\n", + "\n", + "1. Remove yourself as a developer: \n", + "\n", + " - Click on the **DEVELOPERS** tile.\n", + " - Click the **x** next to your name to remove yourself from that model's role.\n", + " - Click **Save** to apply your changes to that role.\n", + "\n", + "2. Add yourself as a validator: \n", + "\n", + " - Click on the **VALIDATORS** tile.\n", + " - Select your name from the drop-down menu.\n", + " - Click **Save** to apply your changes to that role." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "

\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "931d8f7f", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "id": "5ec7fcb7", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your validation environment. You initialize the ValidMind Library with this code snippet, which ensures that your test results are uploaded to the correct model when you run the notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and select the model you registered for this \"ValidMind for model validation\" series of notebooks.\n", + "\n", + "3. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5d87e2d", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b4b5a00f", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Getting to know ValidMind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Preview the validation report template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for model validation. A template predefines sections for your validation report and provides a general outline to follow, making the validation process much easier.\n", + "\n", + "You will attach evidence to this template in the form of risk assessment notes, findings, and test results later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13d34bbb", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "a2e86bc8", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### View validation report in the ValidMind Platform\n", + "\n", + "Next, let's head to the ValidMind Platform to see the template in action:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and select the model you registered for this \"ValidMind for model validation\" series of notebooks.\n", + "\n", + "3. 
Click on the **Validation Report** for your model and note:\n", + "\n", + " - [x] The risk assessment compliance summary at the top of the report (screenshot below)\n", + " - [x] How the structure of the validation report reflects the previewed template\n", + "\n", + " \"Screenshot\n", + "

" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Explore available tests\n", + "\n", + "Next, let's explore the list of all available tests in the ValidMind Library with [the `vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) — we'll later narrow down the tests we want to run from this list when we learn to run tests." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de6abc2a", + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
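\n",
+    "\n",
+    "The next cell uses `%pip show`; as an alternative, a minimal sketch that reads the version from Python with the standard library's `importlib.metadata`:\n",
+    "\n",
+    "```python\n",
+    "from importlib.metadata import version\n",
+    "\n",
+    "# Print the version string of the installed ValidMind Library\n",
+    "print(version(\"validmind\"))\n",
+    "```\n",
+    "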
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10272aa9", + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "id": "upgrade-version-d64591ca-3073-4b3e-9586-d3577adda203", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "44657dea", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "39f45f58", + "metadata": {}, + "source": [ + "\n", + "\n", + "## In summary\n", + "\n", + "In this first notebook, you learned how to:\n", + "\n", + "- [x] Register a model within the ValidMind Platform and assign yourself as the validator\n", + "- [x] Install and initialize the ValidMind Library\n", + "- [x] Preview the validation report template for your model\n", + "- [x] Explore the available tests offered by the ValidMind Library\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "\n", + "\n", + "### Start the model validation process\n", + "\n", + "Now that the ValidMind Library is connected to your model in the ValidMind Library with the correct template applied, we can go ahead and start the model validation process: **[2 — Start the model validation process](2-start_validation_process.ipynb)**" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "name": "python", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/site/notebooks/tutorials/model_validation/2-start_validation_process.ipynb b/site/notebooks/tutorials/model_validation/2-start_validation_process.ipynb new file mode 100644 index 0000000000..5493f1f9c4 --- /dev/null +++ b/site/notebooks/tutorials/model_validation/2-start_validation_process.ipynb @@ -0,0 +1,873 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ValidMind for model validation 2 — Start the model validation process\n", + "\n", + "Learn how to use ValidMind for your end-to-end model validation process with our series of four introductory notebooks. In this second notebook, independently verify the data quality tests performed on the dataset used to train the champion model.\n", + "\n", + "You'll learn how to run relevant validation tests with ValidMind, log the results of those tests to the ValidMind Platform, and insert your logged test results as evidence into your validation report. You'll become familiar with the tests available in ValidMind, as well as how to run them. 
Running tests during model validation is crucial to the effective challenge process, as we want to independently evaluate the evidence and assessments provided by the model development team.\n", + "\n", + "While running our tests in this notebook, we'll focus on:\n", + "\n", + "- Ensuring that data used for training and testing the model is of appropriate data quality\n", + "- Ensuring that the raw data has been preprocessed appropriately and that the resulting final datasets reflects this\n", + "\n", + "**For a full list of out-of-the-box tests,** refer to our [Test descriptions](https://docs.validmind.ai/developer/model-testing/test-descriptions.html) or try the interactive [Test sandbox](https://docs.validmind.ai/developer/model-testing/test-sandbox.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [Prerequisites](#toc1_) \n", + "- [Setting up](#toc2_) \n", + " - [Initialize the ValidMind Library](#toc2_1_) \n", + "- [Load the sample dataset](#toc3_) \n", + "- [Verifying data quality adjustments](#toc4_) \n", + " - [Identify qualitative tests](#toc4_1_) \n", + " - [Initialize the ValidMind datasets](#toc4_2_) \n", + " - [Run data quality tests](#toc4_3_) \n", + " - [Run tabular data tests](#toc4_3_1_) \n", + " - [Remove highly correlated features](#toc4_4_) \n", + "- [Documenting test results](#toc5_) \n", + " - [Configure and run comparison tests](#toc5_1_) \n", + " - [Log tests with a unique identifiers](#toc5_2_) \n", + " - [Add test results to reporting](#toc5_3_) \n", + "- [Split the preprocessed dataset](#toc6_) \n", + " - [Initialize the split datasets](#toc6_1_) \n", + "- [In summary](#toc7_) \n", + "- [Next steps](#toc8_) \n", + " - [Develop potential challenger models](#toc8_1_) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prerequisites\n", + "\n", + "In order to independently assess the quality of your datasets with notebook, you'll need to first have:\n", + "\n", + "- [x] Registered a model within the ValidMind Platform and granted yourself access to the model as a validator\n", + "- [x] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", + "\n", + "
Need help with the above steps?\n", + "

\n", + "Refer to the first notebook in this series: 1 — Set up the ValidMind Library for validation
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library\n", + "\n", + "First, let's connect up the ValidMind Library to our model we previously registered in the ValidMind Platform:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and select the model you registered for this \"ValidMind for model validation\" series of notebooks.\n", + "\n", + "3. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make sure the ValidMind Library is installed\n", + "\n", + "%pip install -q validmind\n", + "\n", + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Load the sample dataset\n", + "\n", + "Let's first import the public [Bank Customer Churn Prediction](https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction) dataset from Kaggle, which was used to develop the dummy champion model.\n", + "\n", + "We'll use this dataset to review steps that should have been conducted during the initial development and documentation of the model to ensure that the model was built correctly. By independently performing steps taken by the model development team, we can confirm whether the model was built using appropriate and properly processed data.\n", + "\n", + "In our below example, note that:\n", + "\n", + "- The target column, `Exited` has a value of `1` when a customer has churned and `0` otherwise.\n", + "- The ValidMind Library provides a wrapper to automatically load the dataset as a Pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.datasets.classification import customer_churn as demo_dataset\n", + "\n", + "print(\n", + " f\"Loaded demo dataset with: \\n\\n\\t• Target column: '{demo_dataset.target_column}' \\n\\t• Class labels: {demo_dataset.class_labels}\"\n", + ")\n", + "\n", + "raw_df = demo_dataset.load_data()\n", + "raw_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Verifying data quality adjustments\n", + "\n", + "Let's say that thanks to the documentation submitted by the model development team ([Learn more ...](https://docs.validmind.ai/developer/validmind-library.html#for-model-development)), we know that the sample dataset was first modified before being used to train the champion model. 
After performing some data quality assessments on the raw dataset, it was determined that the dataset required rebalancing, and highly correlated features were also removed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Identify qualitative tests\n", + "\n", + "During model validation, we use the same data processing logic and training procedure to confirm that the model's results can be reproduced independently, so let's start by doing some data quality assessments by running a few individual tests just like the development team did.\n", + "\n", + "Use the [`vm.tests.list_tests()` function](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) introduced by the first notebook in this series in combination with [`vm.tests.list_tags()`](https://docs.validmind.ai/validmind/validmind/tests.html#list_tags) and [`vm.tests.list_tasks()`](https://docs.validmind.ai/validmind/validmind/tests.html#list_tasks) to find which prebuilt tests are relevant for data quality assessment:\n", + "\n", + "- **`tasks`** represent the kind of modeling task associated with a test. Here we'll focus on `classification` tasks.\n", + "- **`tags`** are free-form descriptions providing more details about the test, for example, what category the test falls into. Here we'll focus on the `data_quality` tag." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the list of available task types\n", + "sorted(vm.tests.list_tasks())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the list of available tags\n", + "sorted(vm.tests.list_tags())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can pass `tags` and `tasks` as parameters to the `vm.tests.list_tests()` function to filter the tests based on the tags and task types.\n", + "\n", + "For example, to find tests related to tabular data quality for classification models, you can call `list_tests()` like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(task=\"classification\", tags=[\"tabular_data\", \"data_quality\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Want to learn more about navigating ValidMind tests?\n", + "

\n", + "Refer to our notebook outlining the utilities available for viewing and understanding available ValidMind tests: Explore tests
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind datasets\n", + "\n", + "With the individual tests we want to run identified, the next step is to connect your data with a ValidMind `Dataset` object. **This step is always necessary every time you want to connect a dataset to documentation and produce test results through ValidMind,** but you only need to do it once per dataset.\n", + "\n", + "Initialize a ValidMind dataset object using the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset) from the ValidMind (`vm`) module. For this example, we'll pass in the following arguments:\n", + "\n", + "- **`dataset`** — The raw dataset that you want to provide as input to tests.\n", + "- **`input_id`** — A unique identifier that allows tracking what inputs are used when running each individual test.\n", + "- **`target_column`** — A required argument if tests require access to true values. This is the name of the target column in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# vm_raw_dataset is now a VMDataset object that you can pass to any ValidMind test\n", + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=raw_df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run data quality tests\n", + "\n", + "Now that we know how to initialize a ValidMind `dataset` object, we're ready to run some tests!\n", + "\n", + "You run individual tests by calling [the `run_test` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test) provided by the `validmind.tests` module. For the examples below, we'll pass in the following arguments:\n", + "\n", + "- **`test_id`** — The ID of the test to run, as seen in the `ID` column when you run `list_tests`. \n", + "- **`params`** — A dictionary of parameters for the test. These will override any `default_params` set in the test definition. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Run tabular data tests\n", + "\n", + "The inputs expected by a test can also be found in the test definition — let's take [`validmind.data_validation.DescriptiveStatistics`](https://docs.validmind.ai/tests/data_validation/DescriptiveStatistics.html) as an example.\n", + "\n", + "Note that the output of the [`describe_test()` function](https://docs.validmind.ai/validmind/validmind/tests.html#describe_test) below shows that this test expects a `dataset` as input:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.describe_test(\"validmind.data_validation.DescriptiveStatistics\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's run a few tests to assess the quality of the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result2 = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.ClassImbalance\",\n", + " inputs={\"dataset\": vm_raw_dataset},\n", + " params={\"min_percent_threshold\": 30},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output above shows that [the class imbalance test](https://docs.validmind.ai/tests/data_validation/ClassImbalance.html) did not pass according to the value we set for `min_percent_threshold` — great, this matches what was reported by the model development team.\n", + "\n", + "To address this issue, we'll re-run the test on some processed data. In this case let's apply a very simple rebalancing technique to the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "raw_copy_df = raw_df.sample(frac=1) # Create a copy of the raw dataset\n", + "\n", + "# Create a balanced dataset with the same number of exited and not exited customers\n", + "exited_df = raw_copy_df.loc[raw_copy_df[\"Exited\"] == 1]\n", + "not_exited_df = raw_copy_df.loc[raw_copy_df[\"Exited\"] == 0].sample(n=exited_df.shape[0])\n", + "\n", + "balanced_raw_df = pd.concat([exited_df, not_exited_df])\n", + "balanced_raw_df = balanced_raw_df.sample(frac=1, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With this new balanced dataset, you can re-run the individual test to see if it now passes the class imbalance test requirement.\n", + "\n", + "As this is technically a different dataset, **remember to first initialize a new ValidMind `Dataset` object** to pass in as input as required by `run_test()`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Register new data and now 'balanced_raw_dataset' is the new dataset object of interest\n", + "vm_balanced_raw_dataset = vm.init_dataset(\n", + " dataset=balanced_raw_df,\n", + " input_id=\"balanced_raw_dataset\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Pass the initialized `balanced_raw_dataset` as input into the test run\n", + "result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.ClassImbalance\",\n", + " inputs={\"dataset\": vm_balanced_raw_dataset},\n", + " params={\"min_percent_threshold\": 30},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "\n", + "\n", + "### Remove highly correlated features\n", + "\n", + "Next, let's also remove highly correlated features from our dataset as outlined by the development team. Removing highly correlated features helps make the model simpler, more stable, and easier to understand.\n", + "\n", + "You can utilize the output from a ValidMind test for further use — in this below example, to retrieve the list of features with the highest correlation coefficients and use them to reduce the final list of features for modeling.\n", + "\n", + "First, we'll run [`validmind.data_validation.HighPearsonCorrelation`](https://docs.validmind.ai/tests/data_validation/HighPearsonCorrelation.html) with the `balanced_raw_dataset` we initialized previously as input as is for comparison with later runs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corr_result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.HighPearsonCorrelation\",\n", + " params={\"max_threshold\": 0.3},\n", + " inputs={\"dataset\": vm_balanced_raw_dataset},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output above shows that the test did not pass according to the value we set for `max_threshold` — as reported and expected.\n", + "\n", + "`corr_result` is an object of type `TestResult`. We can inspect the result object to see what the test has produced:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(type(corr_result))\n", + "print(\"Result ID: \", corr_result.result_id)\n", + "print(\"Params: \", corr_result.params)\n", + "print(\"Passed: \", corr_result.passed)\n", + "print(\"Tables: \", corr_result.tables)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's remove the highly correlated features and create a new VM `dataset` object.\n", + "\n", + "We'll begin by checking out the table in the result and extracting a list of features that failed the test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract table from `corr_result.tables`\n", + "features_df = corr_result.tables[0].data\n", + "features_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract list of features that failed the test\n", + "high_correlation_features = features_df[features_df[\"Pass/Fail\"] == \"Fail\"][\"Columns\"].tolist()\n", + "high_correlation_features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, extract the feature names from the list of strings (example: `(Age, Exited)` > `Age`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "high_correlation_features = [feature.split(\",\")[0].strip(\"()\") for feature in high_correlation_features]\n", + "high_correlation_features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, it's time to re-initialize the dataset with the highly correlated features removed.\n", + "\n", + "**Note the use of a different `input_id`.** This allows tracking the inputs used when running each individual test." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove the highly correlated features from the dataset\n", + "balanced_raw_no_age_df = balanced_raw_df.drop(columns=high_correlation_features)\n", + "\n", + "# Re-initialize the dataset object\n", + "vm_raw_dataset_preprocessed = vm.init_dataset(\n", + " dataset=balanced_raw_no_age_df,\n", + " input_id=\"raw_dataset_preprocessed\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Re-running the test with the reduced feature set should pass the test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corr_result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.HighPearsonCorrelation\",\n", + " params={\"max_threshold\": 0.3},\n", + " inputs={\"dataset\": vm_raw_dataset_preprocessed},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also plot the correlation matrix to visualize the new correlation between features:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corr_result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.PearsonCorrelationMatrix\",\n", + " inputs={\"dataset\": vm_raw_dataset_preprocessed},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Documenting test results\n", + "\n", + "Now that we've done some analysis on two different datasets, we can use ValidMind to easily document why certain things were done to our raw data with testing to support it. As we learned above, every test result returned by the `run_test()` function has a `.log()` method that can be used to send the test results to the ValidMind Platform.\n", + "\n", + "When logging validation test results to the platform, you'll need to manually add those results to the desired section of the validation report. To demonstrate how to add test results to your validation report, we'll log our data quality tests and insert the results via the ValidMind Platform." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Configure and run comparison tests\n", + "\n", + "Below, we'll perform comparison tests between the original raw dataset (`raw_dataset`) and the final preprocessed (`raw_dataset_preprocessed`) dataset, again logging the results to the ValidMind Platform. \n", + "\n", + "We can specify all the tests we'd ike to run in a dictionary called `test_config`, and we'll pass in the following arguments for each test:\n", + "\n", + " - **`params`:** Individual test parameters.\n", + " - **`input_grid`:** Individual test inputs to compare. 
In this case, we'll input our two datasets for comparison.\n", + "\n", + "**Note here that the `input_grid` expects the `input_id` of the dataset as the value rather than the variable name we specified:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Individual test config with inputs specified\n", + "test_config = {\n", + " \"validmind.data_validation.ClassImbalance\": {\n", + " \"input_grid\": {\"dataset\": [\"raw_dataset\", \"raw_dataset_preprocessed\"]},\n", + " \"params\": {\"min_percent_threshold\": 30}\n", + " },\n", + " \"validmind.data_validation.HighPearsonCorrelation\": {\n", + " \"input_grid\": {\"dataset\": [\"raw_dataset\", \"raw_dataset_preprocessed\"]},\n", + " \"params\": {\"max_threshold\": 0.3}\n", + " },\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then batch run and log our tests in `test_config`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for t in test_config:\n", + " print(t)\n", + " try:\n", + " # Check if test has input_grid\n", + " if 'input_grid' in test_config[t]:\n", + " # For tests with input_grid, pass the input_grid configuration\n", + " if 'params' in test_config[t]:\n", + " vm.tests.run_test(t, input_grid=test_config[t]['input_grid'], params=test_config[t]['params']).log()\n", + " else:\n", + " vm.tests.run_test(t, input_grid=test_config[t]['input_grid']).log()\n", + " else:\n", + " # Original logic for regular inputs\n", + " if 'params' in test_config[t]:\n", + " vm.tests.run_test(t, inputs=test_config[t]['inputs'], params=test_config[t]['params']).log()\n", + " else:\n", + " vm.tests.run_test(t, inputs=test_config[t]['inputs']).log()\n", + " except Exception as e:\n", + " print(f\"Error running test {t}: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Note the output indicating that a test-driven block doesn't currently exist in your model's documentation for some test IDs. \n", +    "

\n", + "That's expected, as when we run validations tests the results logged need to be manually added to your report as part of your compliance assessment process within the ValidMind Platform.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Log tests with a unique identifiers\n", + "\n", + "Next, we'll use the previously initialized `vm_balanced_raw_dataset` (that still has a highly correlated `Age` column) as input to run an individual test, then log the result to the ValidMind Platform.\n", + "\n", + "When running individual tests, **you can use a custom `result_id` to tag the individual result with a unique identifier:**\n", + "\n", + "- This `result_id` can be appended to `test_id` with a `:` separator.\n", + "- The `balanced_raw_dataset` result identifier will correspond to the `balanced_raw_dataset` input, the dataset that still has the `Age` column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.HighPearsonCorrelation:balanced_raw_dataset\",\n", + " params={\"max_threshold\": 0.3},\n", + " inputs={\"dataset\": vm_balanced_raw_dataset},\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Add test results to reporting\n", + "\n", + "With some test results logged, let's head to the model we connected to at the beginning of this notebook and learn how to insert a test result into our validation report ([Need more help?](https://docs.validmind.ai/guide/model-validation/assess-compliance.html#link-validator-evidence)).\n", + "\n", + "While the example below focuses on a specific test result, you can follow the same general procedure for your other results:\n", + "\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you connected to earlier.\n", + "\n", + "2. In the left sidebar that appears for your model, click **Validation Report**.\n", + "\n", + "3. Locate the Data Preparation section and click on **2.2.1. Data Quality** to expand that section.\n", + "\n", + "4. Under the Class Imbalance Assessment section, locate Validator Evidence then click **Link Evidence to Report**:\n", + "\n", + " \"Screenshot\n", + "

\n", + "\n", + "5. Select the Class Imbalance test results we logged: **ValidMind Data Validation Class Imbalance** \n", + "\n", + " \"Screenshot\n", + "

\n", + "\n", + "6. Click **Update Linked Evidence** to add the test results to the validation report.\n", + "\n", + " Confirm that the results for the Class Imbalance test you inserted has been correctly inserted into section **2.2.1. Data Quality** of the report:\n", + "\n", + " \"Screenshot\n", + "

\n", + "\n", + "7. Note that these test results are flagged as **Requires Attention** — as they include comparative results from our initial raw dataset.\n", + "\n", + " Click **See evidence details** to review the LLM-generated description that summarizes the test results, that confirm that our final preprocessed dataset actually passes our test:\n", + "\n", + " \"Screenshot\n", + "

\n", + "\n", + "\n", + "
Here in this text editor, you can make qualitative edits to the draft that ValidMind generated to finalize the test results.\n", + "

\n", + "Learn more: Work with content blocks
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Split the preprocessed dataset\n", + "\n", + "With our raw dataset rebalanced with highly correlated features removed, let's now **spilt our dataset into train and test** in preparation for model evaluation testing.\n", + "\n", + "To start, let's grab the first few rows from the `balanced_raw_no_age_df` dataset we initialized earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "balanced_raw_no_age_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before training the model, we need to encode the categorical features in the dataset:\n", + "\n", + "- Use the `OneHotEncoder` class from the `sklearn.preprocessing` module to encode the categorical features.\n", + "- The categorical features in the dataset are `Geography` and `Gender`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "balanced_raw_no_age_df = pd.get_dummies(\n", + " balanced_raw_no_age_df, columns=[\"Geography\", \"Gender\"], drop_first=True\n", + ")\n", + "balanced_raw_no_age_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Splitting our dataset into training and testing is essential for proper validation testing, as this helps assess how well the model generalizes to unseen data:\n", + "\n", + "- We start by dividing our `balanced_raw_no_age_df` dataset into training and test subsets using `train_test_split`, with 80% of the data allocated to training (`train_df`) and 20% to testing (`test_df`).\n", + "- From each subset, we separate the features (all columns except \"Exited\") into `X_train` and `X_test`, and the target column (\"Exited\") into `y_train` and `y_test`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "train_df, test_df = train_test_split(balanced_raw_no_age_df, test_size=0.20)\n", + "\n", + "X_train = train_df.drop(\"Exited\", axis=1)\n", + "y_train = train_df[\"Exited\"]\n", + "X_test = test_df.drop(\"Exited\", axis=1)\n", + "y_test = test_df[\"Exited\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the split datasets\n", + "\n", + "Next, let's initialize the training and testing datasets so they are available for use:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds = vm.init_dataset(\n", + " input_id=\"train_dataset_final\",\n", + " dataset=train_df,\n", + " target_column=\"Exited\",\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " input_id=\"test_dataset_final\",\n", + " dataset=test_df,\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## In summary\n", + "\n", + "In this second notebook, you learned how to:\n", + "\n", + "- [x] Import a sample dataset\n", + "- [x] Identify which tests you might want to run with ValidMind\n", + "- [x] Initialize ValidMind datasets\n", + "- [x] Run individual tests\n", + "- [x] Utilize the output from tests you’ve run\n", + "- [x] Log test results as evidence to the ValidMind Platform\n", + "- [x] Insert test results into your validation report" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "\n", + "\n", + "### Develop potential challenger models\n", + "\n", + "Now that you're familiar with the basics of using the ValidMind Library, let's use it to develop a challenger model: **[3 — Developing a potential challenger model](3-developing_challenger_model.ipynb)**" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "name": "python", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb b/site/notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb new file mode 100644 index 0000000000..b0d2260128 --- /dev/null +++ b/site/notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb @@ -0,0 +1,871 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ValidMind for model validation 3 — Developing a potential challenger model\n", + "\n", + "Learn how to use ValidMind for your end-to-end model validation process with our series of four introductory notebooks. In this third notebook, develop a potential challenger model and then pass your model and its predictions to ValidMind.\n", + "\n", + "A *challenger model* is an alternate model that attempt to outperform the champion model, ensuring that the best performing fit-for-purpose model is always considered for deployment. Challenger models also help avoid over-reliance on a single model, and allow testing of new features, algorithms, or data sources without disrupting the production lifecycle." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [Prerequisites](#toc1_) \n", + "- [Setting up](#toc2_) \n", + " - [Initialize the ValidMind Library](#toc2_1_) \n", + " - [Import the sample dataset](#toc2_2_) \n", + " - [Preprocess the dataset](#toc2_2_1_) \n", + " - [Split the preprocessed dataset](#toc2_3_) \n", + "- [Import the champion model](#toc3_) \n", + "- [Training a potential challenger model](#toc4_) \n", + " - [Random forest classification model](#toc4_1_) \n", + "- [Initializing the model objects](#toc5_) \n", + " - [Initialize the model objects](#toc5_1_) \n", + " - [Assign predictions](#toc5_2_) \n", + "- [Running model validation tests](#toc6_) \n", + " - [Run model performance tests](#toc6_1_) \n", + " - [Evaluate performance of the champion model](#toc6_1_1_) \n", + " - [Log a model finding](#toc6_1_2_) \n", + " - [Evaluate performance of challenger model](#toc6_1_3_) \n", + " - [Run diagnostic tests](#toc6_2_) \n", + " - [Run feature importance tests](#toc6_3_) \n", + "- [In summary](#toc7_) \n", + "- [Next steps](#toc8_) \n", + " - [Finalize validation and reporting](#toc8_1_) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prerequisites\n", + "\n", + "In order to develop potential challenger models with this notebook, you'll need to first have:\n", + "\n", + "- [x] Registered a model within the ValidMind Platform and granted yourself access to the model as a validator\n", + "- [x] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", + "- [x] Learned how to import and initialize datasets for use with ValidMind\n", + "- [x] Understood the basics of how to run and log tests with ValidMind\n", + "- [x] Run data quality tests on the datasets used to train the champion model, and logged the results of those tests to ValidMind\n", + "- [x] Inserted your logged test results into your validation report\n", + "\n", + "
Need help with the above steps?\n", + "

\n", + "Refer to the first two notebooks in this series:\n", + "\n", + "- 1 — Set up the ValidMind Library for validation\n", + "- 2 — Start the model validation process\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up\n", + "\n", + "This section should be quite familiar to you — as we performed the same actions in the previous notebook, **[2 — Start the model validation process](2-start_validation_process.ipynb)**." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library\n", + "\n", + "As usual, let's first connect up the ValidMind Library to our model we previously registered in the ValidMind Platform:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and select the model you registered for this \"ValidMind for model validation\" series of notebooks.\n", + "\n", + "3. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make sure the ValidMind Library is installed\n", + "\n", + "%pip install -q validmind\n", + "\n", + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Import the sample dataset\n", + "\n", + "Next, we'll load in the sample [Bank Customer Churn Prediction](https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction) dataset used to develop the champion model that we will independently preprocess:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the sample dataset\n", + "from validmind.datasets.classification import customer_churn as demo_dataset\n", + "\n", + "print(\n", + " f\"Loaded demo dataset with: \\n\\n\\t• Target column: '{demo_dataset.target_column}' \\n\\t• Class labels: {demo_dataset.class_labels}\"\n", + ")\n", + "\n", + "raw_df = demo_dataset.load_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Preprocess the dataset\n", + "\n", + "We’ll apply a simple rebalancing technique to the dataset before continuing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "raw_copy_df = raw_df.sample(frac=1) # Create a copy of the raw dataset\n", + "\n", + "# Create a balanced dataset with the same number of exited and not exited customers\n", + "exited_df = raw_copy_df.loc[raw_copy_df[\"Exited\"] == 1]\n", + "not_exited_df = raw_copy_df.loc[raw_copy_df[\"Exited\"] == 0].sample(n=exited_df.shape[0])\n", + "\n", + "balanced_raw_df = pd.concat([exited_df, not_exited_df])\n", + "balanced_raw_df = balanced_raw_df.sample(frac=1, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s also quickly remove highly correlated features from the dataset using the 
output from a ValidMind test.\n", + "\n", + "As you know, before we can run tests you’ll need to initialize a ValidMind dataset object with the [`init_dataset` function](https://docs.validmind.ai/validmind/validmind.html#init_dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Register new data and now 'balanced_raw_dataset' is the new dataset object of interest\n", + "vm_balanced_raw_dataset = vm.init_dataset(\n", + " dataset=balanced_raw_df,\n", + " input_id=\"balanced_raw_dataset\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With our balanced dataset initialized, we can then run our test and utilize the output to help us identify the features we want to remove:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run HighPearsonCorrelation test with our balanced dataset as input and return a result object\n", + "corr_result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.HighPearsonCorrelation\",\n", + " params={\"max_threshold\": 0.3},\n", + " inputs={\"dataset\": vm_balanced_raw_dataset},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# From result object, extract table from `corr_result.tables`\n", + "features_df = corr_result.tables[0].data\n", + "features_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract list of features that failed the test\n", + "high_correlation_features = features_df[features_df[\"Pass/Fail\"] == \"Fail\"][\"Columns\"].tolist()\n", + "high_correlation_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract feature names from the list of strings\n", + "high_correlation_features = [feature.split(\",\")[0].strip(\"()\") for feature in high_correlation_features]\n", + "high_correlation_features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then re-initialize the dataset with a different `input_id` and the highly correlated features removed and re-run the test for confirmation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove the highly correlated features from the dataset\n", + "balanced_raw_no_age_df = balanced_raw_df.drop(columns=high_correlation_features)\n", + "\n", + "# Re-initialize the dataset object\n", + "vm_raw_dataset_preprocessed = vm.init_dataset(\n", + " dataset=balanced_raw_no_age_df,\n", + " input_id=\"raw_dataset_preprocessed\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Re-run the test with the reduced feature set\n", + "corr_result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.HighPearsonCorrelation\",\n", + " params={\"max_threshold\": 0.3},\n", + " inputs={\"dataset\": vm_raw_dataset_preprocessed},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Split the preprocessed dataset\n", + "\n", + "With our raw dataset rebalanced with highly correlated features removed, let's now **spilt our dataset into train and test** in preparation for model evaluation testing:" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Encode categorical features in the dataset\n", + "balanced_raw_no_age_df = pd.get_dummies(\n", + " balanced_raw_no_age_df, columns=[\"Geography\", \"Gender\"], drop_first=True\n", + ")\n", + "balanced_raw_no_age_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Split the dataset into train and test\n", + "train_df, test_df = train_test_split(balanced_raw_no_age_df, test_size=0.20)\n", + "\n", + "X_train = train_df.drop(\"Exited\", axis=1)\n", + "y_train = train_df[\"Exited\"]\n", + "X_test = test_df.drop(\"Exited\", axis=1)\n", + "y_test = test_df[\"Exited\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the split datasets\n", + "vm_train_ds = vm.init_dataset(\n", + " input_id=\"train_dataset_final\",\n", + " dataset=train_df,\n", + " target_column=\"Exited\",\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " input_id=\"test_dataset_final\",\n", + " dataset=test_df,\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Import the champion model\n", + "\n", + "With our raw dataset assessed and preprocessed, let's go ahead and import the champion model submitted by the model development team in the format of a `.pkl` file: **[lr_model_champion.pkl](lr_model_champion.pkl)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the champion model\n", + "import pickle as pkl\n", + "\n", + "with open(\"lr_model_champion.pkl\", \"rb\") as f:\n", + " log_reg = pkl.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Training a potential challenger model\n", + "\n", + "We're curious how an alternate model compares to our champion model, so let's train a challenger model as a basis for our testing.\n", + "\n", + "Our champion *logistic regression model* is a simpler, parametric model that assumes a linear relationship between the independent variables and the log-odds of the outcome. While logistic regression may not capture complex patterns as effectively, it offers a high degree of interpretability and is easier to explain to stakeholders. However, model risk is not calculated in isolation from a single factor, but rather in consideration with trade-offs in predictive performance, ease of interpretability, and overall alignment with business objectives." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Random forest classification model\n", + "\n", + "A *random forest classification model* is an ensemble machine learning algorithm that uses multiple decision trees to classify data. In ensemble learning, multiple models are combined to improve prediction accuracy and robustness.\n", + "\n", + "Random forest classification models generally have higher accuracy because they capture complex, non-linear relationships, but as a result they lack transparency in their predictions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the Random Forest Classification model\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "# Create the model instance with 50 decision trees\n", + "rf_model = RandomForestClassifier(\n", + " n_estimators=50,\n", + " random_state=42,\n", + ")\n", + "\n", + "# Train the model\n", + "rf_model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initializing the model objects" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the model objects\n", + "\n", + "In addition to the initialized datasets, you'll also need to initialize a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data for each of our two models.\n", + "\n", + "You simply initialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the champion logistic regression model\n", + "vm_log_model = vm.init_model(\n", + " log_reg,\n", + " input_id=\"log_model_champion\",\n", + ")\n", + "\n", + "# Initialize the challenger random forest classification model\n", + "vm_rf_model = vm.init_model(\n", + " rf_model,\n", + " input_id=\"rf_model\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign predictions\n", + "\n", + "With our models registered, we'll move on to assigning both the predictive probabilities coming directly from each model's predictions, and the binary prediction after applying the cutoff threshold described in the Compute binary predictions step above.\n", + "\n", + "- The [`assign_predictions()` method](https://docs.validmind.ai/validmind/validmind/vm_models.html#assign_predictions) from the `Dataset` object can link existing predictions to any number of models.\n", + "- This method links the model's class prediction values and probabilities to our `vm_train_ds` and `vm_test_ds` datasets.\n", + "\n", + "If no prediction values are passed, the method will compute predictions automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Champion — Logistic regression model\n", + "vm_train_ds.assign_predictions(model=vm_log_model)\n", + "vm_test_ds.assign_predictions(model=vm_log_model)\n", + "\n", + "# Challenger — Random forest classification model\n", + "vm_train_ds.assign_predictions(model=vm_rf_model)\n", + "vm_test_ds.assign_predictions(model=vm_rf_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Running model validation tests\n", + "\n", + "With everything ready for us, let's run the rest of our validation tests. We'll focus on comprehensive testing around model performance of both the champion and challenger models going forward as we've already verified the data quality of the datasets used to train the champion model." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run model performance tests\n", + "\n", + "Let's run some performance tests, beginning with independent testing of our champion logistic regression model, then moving on to our potential challenger model.\n", + "\n", + "Use [`vm.tests.list_tests()`](https://docs.validmind.ai/validmind/validmind/tests.html#list_tests) to identify all the model performance tests for classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "vm.tests.list_tests(tags=[\"model_performance\"], task=\"classification\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll isolate the specific tests we want to run in `mpt`:\n", + "\n", + "- [`ClassifierPerformance`](https://docs.validmind.ai/tests/model_validation/sklearn/ClassifierPerformance.html)\n", + "- [`ConfusionMatrix`](https://docs.validmind.ai/tests/model_validation/sklearn/ConfusionMatrix.html)\n", + "- [`MinimumAccuracy`](https://docs.validmind.ai/tests/model_validation/sklearn/MinimumAccuracy.html)\n", + "- [`MinimumF1Score`](https://docs.validmind.ai/tests/model_validation/sklearn/MinimumF1Score.html)\n", + "- [`ROCCurve`](https://docs.validmind.ai/tests/model_validation/sklearn/ROCCurve.html)\n", + "\n", + "As we learned in the previous notebook [2 — Start the model validation process](2-start_validation_process.ipynb), you can use a custom `result_id` to tag the individual result with a unique identifier by appending this `result_id` to the `test_id` with a `:` separator. We'll append an identifier for our champion model here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mpt = [\n", + " \"validmind.model_validation.sklearn.ClassifierPerformance:logreg_champion\",\n", + " \"validmind.model_validation.sklearn.ConfusionMatrix:logreg_champion\",\n", + " \"validmind.model_validation.sklearn.MinimumAccuracy:logreg_champion\",\n", + " \"validmind.model_validation.sklearn.MinimumF1Score:logreg_champion\",\n", + " \"validmind.model_validation.sklearn.ROCCurve:logreg_champion\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Evaluate performance of the champion model\n", + "\n", + "Now, let's run and log our batch of model performance tests using our testing dataset (`vm_test_ds`) for our champion model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for test in mpt:\n", + " vm.tests.run_test(\n", + " test,\n", + " inputs={\n", + " \"dataset\": vm_test_ds, \"model\" : vm_log_model,\n", + " },\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Note the output indicating that a test-driven block doesn't currently exist in your model's documentation for some test IDs. \n", +    "

\n", + "That's expected, as when we run validations tests the results logged need to be manually added to your report as part of your compliance assessment process within the ValidMind Platform.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Log a model finding\n", + "\n", + "As we can observe from the output above, our champion model doesn't pass the `MinimumAccuracy` based on the default thresholds of the out-of-the-box test, so let's log a model finding in the ValidMind Platform ([Need more help?](https://docs.validmind.ai/guide/model-validation/add-manage-model-findings.html)):\n", + "\n", + "1. From the **Inventory** in the ValidMind Platform, go to the model you connected to earlier.\n", + "\n", + "2. In the left sidebar that appears for your model, click **Validation Report**.\n", + "\n", + "3. Locate the Data Preparation section and click on **2.2.2. Model Performance** to expand that section.\n", + "\n", + "4. Under the Model Performance Metrics section, locate Findings then click **Link Finding to Report**:\n", + "\n", + " \"Screenshot\n", + "

\n", + "\n", + "5. Click **+ Create New Finding** to add a finding.\n", + "\n", + "6. Enter in the details for your finding, for example:\n", + "\n", + " - **TITLE** — Champion Logistic Regression Model Fails Minimum Accuracy Threshold\n", + " - **RISK AREA** — Model Performance\n", + " - **DOCUMENTATION SECTION** — 3.2. Model Evaluation\n", + " - **DESCRIPTION** — The logistic regression champion model was subjected to a Minimum Accuracy test to determine whether its predictive accuracy meets the predefined performance threshold of 0.7. The model achieved an accuracy score of 0.6136, which falls below the required minimum. As a result, the test produced a Fail outcome.\n", + "\n", + "7. Click **Save**.\n", + "\n", + "8. Select the finding you just added to link to your validation report:\n", + "\n", + " \"Screenshot\n", + "

\n", + "\n", + "9. Click **Update Linked Findings** to insert your finding.\n", + "\n", + "10. Confirm that finding you inserted has been correctly inserted into section **2.2.2. Model Performance** of the report:\n", + "\n", + " \"Screenshot\n", + "

\n", + "\n", + "11. Click on the finding to expand the finding, where you can adjust details such as severity, owner, due date, status, etc. as well as include proposed remediation plans or supporting documentation as attachments." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Evaluate performance of challenger model\n", + "\n", + "We've now conducted similar tests as the model development team for our champion model, with the aim of verifying their test results.\n", + "\n", + "Next, let's see how our challenger models compare. We'll use the same batch of tests here as we did in `mpt`, but append a different `result_id` to indicate that these results should be associated with our challenger model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mpt_chall = [\n", + " \"validmind.model_validation.sklearn.ClassifierPerformance:champion_vs_challenger\",\n", + " \"validmind.model_validation.sklearn.ConfusionMatrix:champion_vs_challenger\",\n", + " \"validmind.model_validation.sklearn.MinimumAccuracy:champion_vs_challenger\",\n", + " \"validmind.model_validation.sklearn.MinimumF1Score:champion_vs_challenger\",\n", + " \"validmind.model_validation.sklearn.ROCCurve:champion_vs_challenger\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll run each test once for each model with the same `vm_test_ds` dataset to compare them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for test in mpt_chall:\n", + " vm.tests.run_test(\n", + " test,\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds], \"model\" : [vm_log_model,vm_rf_model]\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Based on the performance metrics, our challenger random forest classification model passes the MinimumAccuracy where our champion did not.\n", + "

\n", + "In your validation report, support your recommendation in your finding's Proposed Remediation Plan to investigate the usage of our challenger model by inserting the performance tests we logged with this notebook into the appropriate section.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run diagnostic tests\n", + "\n", + "Next we want to inspect the robustness and stability testing comparison between our champion and challenger model.\n", + "\n", + "Use `list_tests()` to identify all the model diagnosis tests for classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.list_tests(tags=[\"model_diagnosis\"], task=\"classification\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see if models suffer from any *overfit* potentials and also where there are potential sub-segments of issues with the [`OverfitDiagnosis` test](https://docs.validmind.ai/tests/model_validation/sklearn/OverfitDiagnosis.html). \n", + "\n", + "Overfitting occurs when a model learns the training data too well, capturing not only the true pattern but noise and random fluctuations resulting in excellent performance on the training dataset but poor generalization to new, unseen data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " test_id=\"validmind.model_validation.sklearn.OverfitDiagnosis:champion_vs_challenger\",\n", + " input_grid={\n", + " \"datasets\": [[vm_train_ds,vm_test_ds]],\n", + " \"model\" : [vm_log_model,vm_rf_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's also conduct *robustness* and *stability* testing of the two models with the [`RobustnessDiagnosis` test](https://docs.validmind.ai/tests/model_validation/sklearn/RobustnessDiagnosis.html).\n", + "\n", + "Robustness refers to a model's ability to maintain consistent performance, and stability refers to a model's ability to produce consistent outputs over time across different data subsets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " test_id=\"validmind.model_validation.sklearn.RobustnessDiagnosis:Champion_vs_LogRegression\",\n", + " input_grid={\n", + " \"datasets\": [[vm_train_ds,vm_test_ds]],\n", + " \"model\" : [vm_log_model,vm_rf_model]\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run feature importance tests\n", + "\n", + "We also want to verify the relative influence of different input features on our models' predictions, as well as inspect the differences between our champion and challenger model to see if a certain model offers more understandable or logical importance scores for features.\n", + "\n", + "Use `list_tests()` to identify all the feature importance tests for classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Store the feature importance tests\n", + "FI = vm.tests.list_tests(tags=[\"feature_importance\"], task=\"classification\",pretty=False)\n", + "FI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run and log our feature importance tests for both models for the testing dataset\n", + "for test in FI:\n", + " vm.tests.run_test(\n", + " \"\".join((test,':champion_vs_challenger')),\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds], \"model\" : [vm_log_model,vm_rf_model]\n", + " },\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## In summary\n", + "\n", + "In this third notebook, you learned how to:\n", + "\n", + "- [x] Initialize ValidMind model objects\n", + "- [x] Assign predictions and probabilities to your ValidMind model objects\n", + "- [x] Use tests from ValidMind to evaluate the potential of models, including comparative tests between champion and challenger models\n", + "- [x] Log a model finding in the ValidMind Platform" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "\n", + "\n", + "### Finalize validation and reporting\n", + "\n", + "Now that you're familiar with the basics of using the ValidMind Library to run and log validation tests, let's learn how to implement some custom tests and wrap up our validation: **[4 — Finalize validation and reporting](4-finalize_validation_reporting.ipynb)**" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "name": "python", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb b/site/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb new file mode 100644 index 0000000000..1e5561c514 --- /dev/null +++ b/site/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb @@ -0,0 +1,1207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ValidMind for model validation 4 — Finalize testing and reporting\n", + "\n", + "Learn how to use ValidMind for your end-to-end model validation process with our series of four introductory notebooks. 
In this last notebook, finalize the compliance assessment process and have a complete validation report ready for review.\n", + "\n", + "This notebook will walk you through how to supplement ValidMind tests with your own custom tests and include them as additional evidence in your validation report. A custom test is any function that takes a set of inputs and parameters as arguments and returns one or more outputs:\n", + "\n", + "- The function can be as simple or as complex as you need it to be — it can use external libraries, make API calls, or do anything else that you can do in Python.\n", + "- The only requirement is that the function signature and return values can be \"understood\" and handled by the ValidMind Library. As such, custom tests offer added flexibility by extending the default tests provided by ValidMind, enabling you to document any type of model or use case.\n", + "\n", + "**For a more in-depth introduction to custom tests,** refer to our [Implement custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb) notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [Prerequisites](#toc1_) \n", + "- [Setting up](#toc2_) \n", + " - [Initialize the ValidMind Library](#toc2_1_) \n", + " - [Import the sample dataset](#toc2_2_) \n", + " - [Split the preprocessed dataset](#toc2_3_) \n", + " - [Import the champion model](#toc2_4_) \n", + " - [Train potential challenger model](#toc2_5_) \n", + " - [Initialize the model objects](#toc2_6_) \n", + "- [Implementing custom tests](#toc3_) \n", + " - [Implement a custom inline test](#toc3_1_) \n", + " - [Create a confusion matrix plot](#toc3_1_1_) \n", + " - [Add parameters to custom tests](#toc3_1_2_) \n", + " - [Pass parameters to custom tests](#toc3_1_3_) \n", + " - [Use external test providers](#toc3_2_) \n", + " - [Create custom tests folder](#toc3_2_1_) \n", + " - [Save an inline test](#toc3_2_2_) \n", + " - [Register a local test provider](#toc3_2_3_) \n", + "- [Verify test runs](#toc4_) \n", + "- [In summary](#toc5_) \n", + "- [Next steps](#toc6_) \n", + " - [Work with your validation report](#toc6_1_) \n", + " - [Learn more](#toc6_2_) \n", + " - [More how-to guides and code samples](#toc6_2_1_) \n", + " - [Discover more learning resources](#toc6_2_2_) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prerequisites\n", + "\n", + "In order to finalize validation and reporting, you'll need to first have:\n", + "\n", + "- [x] Registered a model within the ValidMind Platform and granted yourself access to the model as a validator\n", + "- [x] Installed the ValidMind Library in your local environment, allowing you to access all its features\n", + "- [x] Learned how to import and initialize datasets and models for use with ValidMind\n", + "- [x] Understood the basics of how to identify and run validation tests\n", + "- [x] Run validation tests for your champion and challenger models, and logged the results of those tests to the ValidMind Platform\n", + "- [x] Inserted your logged test results into your validation report\n", + "- [x] Added some preliminary findings to your validation report\n", + "\n", + "
Need help with the above steps?\n", + "

\n", + "Refer to the first three notebooks in this series:\n", + "\n", + "- 1 — Set up the ValidMind Library for validation\n", + "- 2 — Start the model validation process\n", + "- 2 — Developing a potential challenger model\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up\n", + "\n", + "This section should be very familiar to you now — as we performed the same actions in the previous two notebooks in this series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library\n", + "\n", + "As usual, let's first connect up the ValidMind Library to our model we previously registered in the ValidMind Platform:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and select the model you registered for this \"ValidMind for model validation\" series of notebooks.\n", + "\n", + "3. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make sure the ValidMind Library is installed\n", + "\n", + "%pip install -q validmind\n", + "\n", + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Import the sample dataset\n", + "\n", + "Next, we'll load in the same sample [Bank Customer Churn Prediction](https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction) dataset used to develop the champion model that we will independently preprocess:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the sample dataset\n", + "from validmind.datasets.classification import customer_churn as demo_dataset\n", + "\n", + "print(\n", + " f\"Loaded demo dataset with: \\n\\n\\t• Target column: '{demo_dataset.target_column}' \\n\\t• Class labels: {demo_dataset.class_labels}\"\n", + ")\n", + "\n", + "raw_df = demo_dataset.load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the raw dataset for use in ValidMind tests\n", + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=raw_df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "raw_copy_df = raw_df.sample(frac=1) # Create a copy of the raw dataset\n", + "\n", + "# Create a balanced dataset with the same number of exited and not exited customers\n", + "exited_df = raw_copy_df.loc[raw_copy_df[\"Exited\"] == 1]\n", + "not_exited_df = raw_copy_df.loc[raw_copy_df[\"Exited\"] == 0].sample(n=exited_df.shape[0])\n", + "\n", + "balanced_raw_df = pd.concat([exited_df, not_exited_df])\n", + "balanced_raw_df = balanced_raw_df.sample(frac=1, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s also quickly remove 
highly correlated features from the dataset using the output from a ValidMind test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Register new data and now 'balanced_raw_dataset' is the new dataset object of interest\n", + "vm_balanced_raw_dataset = vm.init_dataset(\n", + " dataset=balanced_raw_df,\n", + " input_id=\"balanced_raw_dataset\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run HighPearsonCorrelation test with our balanced dataset as input and return a result object\n", + "corr_result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.HighPearsonCorrelation\",\n", + " params={\"max_threshold\": 0.3},\n", + " inputs={\"dataset\": vm_balanced_raw_dataset},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# From result object, extract table from `corr_result.tables`\n", + "features_df = corr_result.tables[0].data\n", + "features_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract list of features that failed the test\n", + "high_correlation_features = features_df[features_df[\"Pass/Fail\"] == \"Fail\"][\"Columns\"].tolist()\n", + "high_correlation_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract feature names from the list of strings\n", + "high_correlation_features = [feature.split(\",\")[0].strip(\"()\") for feature in high_correlation_features]\n", + "high_correlation_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove the highly correlated features from the dataset\n", + "balanced_raw_no_age_df = balanced_raw_df.drop(columns=high_correlation_features)\n", + "\n", + "# Re-initialize the dataset object\n", + "vm_raw_dataset_preprocessed = vm.init_dataset(\n", + " dataset=balanced_raw_no_age_df,\n", + " input_id=\"raw_dataset_preprocessed\",\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Re-run the test with the reduced feature set\n", + "corr_result = vm.tests.run_test(\n", + " test_id=\"validmind.data_validation.HighPearsonCorrelation\",\n", + " params={\"max_threshold\": 0.3},\n", + " inputs={\"dataset\": vm_raw_dataset_preprocessed},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Split the preprocessed dataset\n", + "\n", + "With our raw dataset rebalanced with highly correlated features removed, let's now **spilt our dataset into train and test** in preparation for model evaluation testing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Encode categorical features in the dataset\n", + "balanced_raw_no_age_df = pd.get_dummies(\n", + " balanced_raw_no_age_df, columns=[\"Geography\", \"Gender\"], drop_first=True\n", + ")\n", + "balanced_raw_no_age_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Split the dataset into train and test\n", + "train_df, test_df = 
train_test_split(balanced_raw_no_age_df, test_size=0.20)\n", + "\n", + "X_train = train_df.drop(\"Exited\", axis=1)\n", + "y_train = train_df[\"Exited\"]\n", + "X_test = test_df.drop(\"Exited\", axis=1)\n", + "y_test = test_df[\"Exited\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the split datasets\n", + "vm_train_ds = vm.init_dataset(\n", + " input_id=\"train_dataset_final\",\n", + " dataset=train_df,\n", + " target_column=\"Exited\",\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " input_id=\"test_dataset_final\",\n", + " dataset=test_df,\n", + " target_column=\"Exited\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Import the champion model\n", + "\n", + "With our raw dataset assessed and preprocessed, let's go ahead and import the champion model submitted by the model development team in the format of a `.pkl` file: **[lr_model_champion.pkl](lr_model_champion.pkl)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the champion model\n", + "import pickle as pkl\n", + "\n", + "with open(\"lr_model_champion.pkl\", \"rb\") as f:\n", + " log_reg = pkl.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Train potential challenger model\n", + "\n", + "We'll also train our random forest classification challenger model to see how it compares:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the Random Forest Classification model\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "# Create the model instance with 50 decision trees\n", + "rf_model = RandomForestClassifier(\n", + " n_estimators=50,\n", + " random_state=42,\n", + ")\n", + "\n", + "# Train the model\n", + "rf_model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the model objects\n", + "\n", + "In addition to the initialized datasets, you'll also need to initialize a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data for each of our two models:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the champion logistic regression model\n", + "vm_log_model = vm.init_model(\n", + " log_reg,\n", + " input_id=\"log_model_champion\",\n", + ")\n", + "\n", + "# Initialize the challenger random forest classification model\n", + "vm_rf_model = vm.init_model(\n", + " rf_model,\n", + " input_id=\"rf_model\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Assign predictions to Champion — Logistic regression model\n", + "vm_train_ds.assign_predictions(model=vm_log_model)\n", + "vm_test_ds.assign_predictions(model=vm_log_model)\n", + "\n", + "# Assign predictions to Challenger — Random forest classification model\n", + "vm_train_ds.assign_predictions(model=vm_rf_model)\n", + "vm_test_ds.assign_predictions(model=vm_rf_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Implementing custom tests\n", + "\n", + "Thanks to the model documentation ([Learn more 
...](https://docs.validmind.ai/developer/validmind-library.html#for-model-development)), we know that the model development team implemented a custom test to further evaluate the performance of the champion model.\n", + "\n", + "In a usual model validation situation, you would load a saved custom test provided by the model development team. In the following section, we'll have you implement the same custom test and make it available for reuse, to familiarize you with the processes.\n", + "\n", + "
Want to learn more about custom tests?\n", + "

\n", + "Refer to our in-depth introduction to custom tests: Implement custom tests
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Implement a custom inline test\n", + "\n", + "Let's implement the same custom *inline test* that calculates the confusion matrix for a binary classification model that the model development team used in their performance evaluations.\n", + "\n", + "- An inline test refers to a test written and executed within the same environment as the code being tested — in this case, right in this Jupyter Notebook — without requiring a separate test file or framework.\n", + "- You'll note that the custom test function is just a regular Python function that can include and require any Python library as you see fit." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Create a confusion matrix plot\n", + "\n", + "Let's first create a confusion matrix plot using the `confusion_matrix` function from the `sklearn.metrics` module:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn import metrics\n", + "\n", + "# Get the predicted classes\n", + "y_pred = log_reg.predict(vm_test_ds.x)\n", + "\n", + "confusion_matrix = metrics.confusion_matrix(y_test, y_pred)\n", + "\n", + "cm_display = metrics.ConfusionMatrixDisplay(\n", + " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", + ")\n", + "cm_display.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, create a [`@vm.test` wrapper](https://docs.validmind.ai/validmind/validmind.html#test) that will allow you to create a reusable test. **Note the following changes in the code below:**\n", + "\n", + "- The function `confusion_matrix` takes two arguments `dataset` and `model`. This is a `VMDataset` and `VMModel` object respectively.\n", + " - `VMDataset` objects allow you to access the dataset's true (target) values by accessing the `.y` attribute.\n", + " - `VMDataset` objects allow you to access the predictions for a given model by accessing the `.y_pred()` method.\n", + "- The function docstring provides a description of what the test does. This will be displayed along with the result in this notebook as well as in the ValidMind Platform.\n", + "- The function body calculates the confusion matrix using the `sklearn.metrics.confusion_matrix` function as we just did above.\n", + "- The function then returns the `ConfusionMatrixDisplay.figure_` object — this is important as the ValidMind Library expects the output of the custom test to be a plot or a table.\n", + "- The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ConfusionMatrix`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.ConfusionMatrix\")\n", + "def confusion_matrix(dataset, model):\n", + " \"\"\"The confusion matrix is a table that is often used to describe the performance of a classification model on a set of data for which the true values are known.\n", + "\n", + " The confusion matrix is a 2x2 table that contains 4 values:\n", + "\n", + " - True Positive (TP): the number of correct positive predictions\n", + " - True Negative (TN): the number of correct negative predictions\n", + " - False Positive (FP): the number of incorrect positive predictions\n", + " - False Negative (FN): the number of incorrect negative predictions\n", + "\n", + " The confusion matrix can be used to assess the holistic performance of a classification model by showing the accuracy, precision, recall, and F1 score of the model on a single figure.\n", + " \"\"\"\n", + " y_true = dataset.y\n", + " y_pred = dataset.y_pred(model=model)\n", + "\n", + " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", + "\n", + " cm_display = metrics.ConfusionMatrixDisplay(\n", + " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", + " )\n", + " cm_display.plot()\n", + "\n", + " plt.close() # close the plot to avoid displaying it\n", + "\n", + " return cm_display.figure_ # return the figure object itself" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can now run the newly created custom test on both the training and test datasets for both models using the [`run_test()` function](https://docs.validmind.ai/validmind/validmind/tests.html#run_test):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Champion train and test\n", + "vm.tests.run_test(\n", + " test_id=\"my_custom_tests.ConfusionMatrix:champion\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds,vm_test_ds],\n", + " \"model\" : [vm_log_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Challenger train and test\n", + "vm.tests.run_test(\n", + " test_id=\"my_custom_tests.ConfusionMatrix:challenger\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds,vm_test_ds],\n", + " \"model\" : [vm_rf_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Note the output returned indicating that a test-driven block doesn't currently exist in your model's documentation for some test IDs. \n", + "

\n", + "That's expected, as when we run validations tests the results logged need to be manually added to your report as part of your compliance assessment process within the ValidMind Platform.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Add parameters to custom tests\n", + "\n", + "Custom tests can take parameters just like any other function. To demonstrate, let's modify the `confusion_matrix` function to take an additional parameter `normalize` that will allow you to normalize the confusion matrix:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.ConfusionMatrix\")\n", + "def confusion_matrix(dataset, model, normalize=False):\n", + " \"\"\"The confusion matrix is a table that is often used to describe the performance of a classification model on a set of data for which the true values are known.\n", + "\n", + " The confusion matrix is a 2x2 table that contains 4 values:\n", + "\n", + " - True Positive (TP): the number of correct positive predictions\n", + " - True Negative (TN): the number of correct negative predictions\n", + " - False Positive (FP): the number of incorrect positive predictions\n", + " - False Negative (FN): the number of incorrect negative predictions\n", + "\n", + " The confusion matrix can be used to assess the holistic performance of a classification model by showing the accuracy, precision, recall, and F1 score of the model on a single figure.\n", + " \"\"\"\n", + " y_true = dataset.y\n", + " y_pred = dataset.y_pred(model=model)\n", + "\n", + " if normalize:\n", + " confusion_matrix = metrics.confusion_matrix(y_true, y_pred, normalize=\"all\")\n", + " else:\n", + " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", + "\n", + " cm_display = metrics.ConfusionMatrixDisplay(\n", + " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", + " )\n", + " cm_display.plot()\n", + "\n", + " plt.close() # close the plot to avoid displaying it\n", + "\n", + " return cm_display.figure_ # return the figure object itself" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Pass parameters to custom tests\n", + "\n", + "You can pass parameters to custom tests by providing a dictionary of parameters to the `run_test()` function.\n", + "\n", + "- The parameters will override any default parameters set in the custom test definition. 
Note that `dataset` and `model` are still passed as `inputs`.\n", + "- Since these are `VMDataset` or `VMModel` inputs, they have a special meaning.\n", + "\n", + "Re-running and logging the custom confusion matrix with `normalize=True` for both models and our testing dataset looks like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Champion with test dataset and normalize=True\n", + "vm.tests.run_test(\n", + " test_id=\"my_custom_tests.ConfusionMatrix:test_normalized_champion\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\" : [vm_log_model]\n", + " },\n", + " params={\"normalize\": True}\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Challenger with test dataset and normalize=True\n", + "vm.tests.run_test(\n", + " test_id=\"my_custom_tests.ConfusionMatrix:test_normalized_challenger\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\" : [vm_rf_model]\n", + " },\n", + " params={\"normalize\": True}\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Use external test providers\n", + "\n", + "Sometimes you may want to reuse the same set of custom tests across multiple models and share them with others in your organization, like the model development team would have done with you in this example workflow featured in this series of notebooks. In this case, you can create an external custom *test provider* that will allow you to load custom tests from a local folder or a Git repository.\n", + "\n", + "In this section you will learn how to declare a local filesystem test provider that allows loading tests from a local folder following these high level steps:\n", + "\n", + "1. Create a folder of custom tests from existing inline tests (tests that exist in your active Jupyter Notebook)\n", + "2. Save an inline test to a file\n", + "3. Define and register a [`LocalTestProvider`](https://docs.validmind.ai/validmind/validmind/tests.html#LocalTestProvider) that points to that folder\n", + "4. Run test provider tests\n", + "5. Add the test results to your documentation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Create custom tests folder\n", + "\n", + "Let's start by creating a new folder that will contain reusable custom tests from your existing inline tests.\n", + "\n", + "The following code snippet will create a new `my_tests` directory in the current working directory if it doesn't exist:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tests_folder = \"my_tests\"\n", + "\n", + "import os\n", + "\n", + "# create tests folder\n", + "os.makedirs(tests_folder, exist_ok=True)\n", + "\n", + "# remove existing tests\n", + "for f in os.listdir(tests_folder):\n", + " # remove files and pycache\n", + " if f.endswith(\".py\") or f == \"__pycache__\":\n", + " os.system(f\"rm -rf {tests_folder}/{f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After running the command above, confirm that a new `my_tests` directory was created successfully. 
For example:\n", + "\n", + "```\n", + "~/notebooks/tutorials/model_validation/my_tests/\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Save an inline test\n", + "\n", + "The `@vm.test` decorator we used in **Implement a custom inline test** above to register one-off custom tests also includes a convenience method on the function object that allows you to simply call `.save()` to save the test to a Python file at a specified path.\n", + "\n", + "While `save()` will get you started by creating the file and saving the function code with the correct name, it won't automatically include any imports, or other functions or variables, outside of the functions that are needed for the test to run. To solve this, pass in an optional `imports` argument ensuring necessary imports are added to the file.\n", + "\n", + "The `confusion_matrix` test requires the following additional imports:\n", + "\n", + "```python\n", + "import matplotlib.pyplot as plt\n", + "from sklearn import metrics\n", + "```\n", + "\n", + "Let's pass these imports to the `save()` method to ensure they are included in the file with the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "confusion_matrix.save(\n", + " # Save it to the custom tests folder we created\n", + " tests_folder,\n", + " imports=[\"import matplotlib.pyplot as plt\", \"from sklearn import metrics\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- [x] Confirm that the `save()` method saved the `confusion_matrix` function to a file named `ConfusionMatrix.py` in the `my_tests` folder.\n", + "- [x] Note that the new file provides some context on the origin of the test, which is useful for traceability:\n", + "\n", + " ```\n", + " # Saved from __main__.confusion_matrix\n", + " # Original Test ID: my_custom_tests.ConfusionMatrix\n", + " # New Test ID: .ConfusionMatrix\n", + " ```\n", + "\n", + "- [x] Additionally, the new test function has been stripped off its decorator, as it now resides in a file that will be loaded by the test provider:\n", + "\n", + " ```python\n", + " def ConfusionMatrix(dataset, model, normalize=False):\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register a local test provider\n", + "\n", + "Now that your `my_tests` folder has a sample custom test, let's initialize a test provider that will tell the ValidMind Library where to find your custom tests:\n", + "\n", + "- ValidMind offers out-of-the-box test providers for local tests (tests in a folder) or a Github provider for tests in a Github repository.\n", + "- You can also create your own test provider by creating a class that has a [`load_test` method](https://docs.validmind.ai/validmind/validmind/tests.html#load_test) that takes a test ID and returns the test function matching that ID.\n", + "\n", + "
Want to learn more about test providers?\n", + "

\n", + "An extended introduction to test providers can be found in: Integrate external test providers
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Initialize a local test provider\n", + "\n", + "For most use cases, using a `LocalTestProvider` that allows you to load custom tests from a designated directory should be sufficient.\n", + "\n", + "**The most important attribute for a test provider is its `namespace`.** This is a string that will be used to prefix test IDs in model documentation. This allows you to have multiple test providers with tests that can even share the same ID, but are distinguished by their namespace.\n", + "\n", + "Let's go ahead and load the custom tests from our `my_tests` directory:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import LocalTestProvider\n", + "\n", + "# initialize the test provider with the tests folder we created earlier\n", + "my_test_provider = LocalTestProvider(tests_folder)\n", + "\n", + "vm.tests.register_test_provider(\n", + " namespace=\"my_test_provider\",\n", + " test_provider=my_test_provider,\n", + ")\n", + "# `my_test_provider.load_test()` will be called for any test ID that starts with `my_test_provider`\n", + "# e.g. `my_test_provider.ConfusionMatrix` will look for a function named `ConfusionMatrix` in `my_tests/ConfusionMatrix.py` file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Run test provider tests\n", + "\n", + "Now that we've set up the test provider, we can run any test that's located in the tests folder by using the `run_test()` method as with any other test:\n", + "\n", + "- For tests that reside in a test provider directory, the test ID will be the `namespace` specified when registering the provider, followed by the path to the test file relative to the tests folder.\n", + "- For example, the Confusion Matrix test we created earlier will have the test ID `my_test_provider.ConfusionMatrix`. You could organize the tests in subfolders, say `classification` and `regression`, and the test ID for the Confusion Matrix test would then be `my_test_provider.classification.ConfusionMatrix`.\n", + "\n", + "Let's go ahead and re-run the confusion matrix test with our testing dataset for our two models by using the test ID `my_test_provider.ConfusionMatrix`. This should load the test from the test provider and run it as before." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Champion with test dataset and test provider custom test\n", + "vm.tests.run_test(\n", + " test_id=\"my_test_provider.ConfusionMatrix:champion\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\" : [vm_log_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Challenger with test dataset and test provider custom test\n", + "vm.tests.run_test(\n", + " test_id=\"my_test_provider.ConfusionMatrix:challenger\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\" : [vm_rf_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Verify test runs\n", + "\n", + "Our final task is to verify that all the tests provided by the model development team were run and reported accurately. 
Note the appended `result_ids` to delineate which dataset we ran the test with for the relevant tests.\n", + "\n", + "Here, we'll specify all the tests we'd like to independently rerun in a dictionary called `test_config`. **Note here that `inputs` and `input_grid` expect the `input_id` of the dataset or model as the value rather than the variable name we specified**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_config = {\n", + " # Run with the raw dataset\n", + " 'validmind.data_validation.DatasetDescription:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'}\n", + " },\n", + " 'validmind.data_validation.DescriptiveStatistics:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'}\n", + " },\n", + " 'validmind.data_validation.MissingValues:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'min_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.ClassImbalance:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'min_percent_threshold': 10}\n", + " },\n", + " 'validmind.data_validation.Duplicates:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'min_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.HighCardinality:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {\n", + " 'num_threshold': 100,\n", + " 'percent_threshold': 0.1,\n", + " 'threshold_type': 'percent'\n", + " }\n", + " },\n", + " 'validmind.data_validation.Skewness:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'max_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.UniqueRows:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'min_percent_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.TooManyZeroValues:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'max_percent_threshold': 0.03}\n", + " },\n", + " 'validmind.data_validation.IQROutliersTable:raw_data': {\n", + " 'inputs': {'dataset': 'raw_dataset'},\n", + " 'params': {'threshold': 5}\n", + " },\n", + " # Run with the preprocessed dataset\n", + " 'validmind.data_validation.DescriptiveStatistics:preprocessed_data': {\n", + " 'inputs': {'dataset': 'raw_dataset_preprocessed'}\n", + " },\n", + " 'validmind.data_validation.TabularDescriptionTables:preprocessed_data': {\n", + " 'inputs': {'dataset': 'raw_dataset_preprocessed'}\n", + " },\n", + " 'validmind.data_validation.MissingValues:preprocessed_data': {\n", + " 'inputs': {'dataset': 'raw_dataset_preprocessed'},\n", + " 'params': {'min_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.TabularNumericalHistograms:preprocessed_data': {\n", + " 'inputs': {'dataset': 'raw_dataset_preprocessed'}\n", + " },\n", + " 'validmind.data_validation.TabularCategoricalBarPlots:preprocessed_data': {\n", + " 'inputs': {'dataset': 'raw_dataset_preprocessed'}\n", + " },\n", + " 'validmind.data_validation.TargetRateBarPlots:preprocessed_data': {\n", + " 'inputs': {'dataset': 'raw_dataset_preprocessed'},\n", + " 'params': {'default_column': 'loan_status'}\n", + " },\n", + " # Run with the training and test datasets\n", + " 'validmind.data_validation.DescriptiveStatistics:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final']}\n", + " },\n", + " 'validmind.data_validation.TabularDescriptionTables:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 
'test_dataset_final']}\n", + " },\n", + " 'validmind.data_validation.ClassImbalance:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final']},\n", + " 'params': {'min_percent_threshold': 10}\n", + " },\n", + " 'validmind.data_validation.UniqueRows:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final']},\n", + " 'params': {'min_percent_threshold': 1}\n", + " },\n", + " 'validmind.data_validation.TabularNumericalHistograms:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final']}\n", + " },\n", + " 'validmind.data_validation.MutualInformation:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final']},\n", + " 'params': {'min_threshold': 0.01}\n", + " },\n", + " 'validmind.data_validation.PearsonCorrelationMatrix:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final']}\n", + " },\n", + " 'validmind.data_validation.HighPearsonCorrelation:development_data': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final']},\n", + " 'params': {'max_threshold': 0.3, 'top_n_correlations': 10}\n", + " },\n", + " 'validmind.model_validation.ModelMetadata': {\n", + " 'input_grid': {'model': ['log_model_champion', 'rf_model']}\n", + " },\n", + " 'validmind.model_validation.sklearn.ModelParameters': {\n", + " 'input_grid': {'model': ['log_model_champion', 'rf_model']}\n", + " },\n", + " 'validmind.model_validation.sklearn.ROCCurve': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final'], 'model': ['log_model_champion']}\n", + " },\n", + " 'validmind.model_validation.sklearn.MinimumROCAUCScore': {\n", + " 'input_grid': {'dataset': ['train_dataset_final', 'test_dataset_final'], 'model': ['log_model_champion']},\n", + " 'params': {'min_threshold': 0.5}\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then batch run and log our tests in `test_config`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for t in test_config:\n", + " print(t)\n", + " try:\n", + " # Check if test has input_grid\n", + " if 'input_grid' in test_config[t]:\n", + " # For tests with input_grid, pass the input_grid configuration\n", + " if 'params' in test_config[t]:\n", + " vm.tests.run_test(t, input_grid=test_config[t]['input_grid'], params=test_config[t]['params']).log()\n", + " else:\n", + " vm.tests.run_test(t, input_grid=test_config[t]['input_grid']).log()\n", + " else:\n", + " # Original logic for regular inputs\n", + " if 'params' in test_config[t]:\n", + " vm.tests.run_test(t, inputs=test_config[t]['inputs'], params=test_config[t]['params']).log()\n", + " else:\n", + " vm.tests.run_test(t, inputs=test_config[t]['inputs']).log()\n", + " except Exception as e:\n", + " print(f\"Error running test {t}: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## In summary\n", + "\n", + "In this final notebook, you learned how to:\n", + "\n", + "- [x] Implement a custom inline test\n", + "- [x] Run and log your custom inline tests\n", + "- [x] Use external custom test providers\n", + "- [x] Run and log tests from your custom test providers\n", + "- [x] Re-run tests provided by your model development team to verify that they were run and reported accurately\n", + "\n", + "With our ValidMind for model validation 
series of notebooks, you learned how to validate a model end-to-end with the ValidMind Library by running through some common scenarios in a typical model validation setting:\n", + "\n", + "- Verifying the data quality steps performed by the model development team\n", + "- Independently replicating the champion model's results and conducting additional tests to assess performance, stability, and robustness\n", + "- Setting up test inputs and a challenger model for comparative analysis\n", + "- Running validation tests, analyzing results, and logging findings to ValidMind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Work with your validation report\n", + "\n", + "Now that you've logged all your test results and verified the work done by the model development team, head to the ValidMind Platform to wrap up your validation report. Continue to work on your validation report by:\n", + "\n", + "- **Inserting additional test results:** Click **Link Evidence to Report** under any section of 2. Validation in your validation report. (Learn more: [Link evidence to reports](https://docs.validmind.ai/guide/model-validation/assess-compliance.html#link-evidence-to-reports))\n", + "\n", + "- **Making qualitative edits to your test descriptions:** Expand any linked evidence under Validator Evidence and click **See evidence details** to review and edit the ValidMind-generated test descriptions for quality and accuracy.\n", + "\n", + "- **Adding more findings:** Click **Link Finding to Report** in any validation report section, then click **+ Create New Finding**. (Learn more: [Add and manage model findings](https://docs.validmind.ai/guide/model-validation/add-manage-model-findings.html))\n", + "\n", + "- **Adding risk assessment notes:** Click under **Risk Assessment Notes** in any validation report section to access the text editor and content editing toolbar, including an option to generate a draft with AI. Edit your ValidMind-generated test descriptions (Learn more: [Work with content blocks](https://docs.validmind.ai/guide/model-documentation/work-with-content-blocks.html#content-editing-toolbar))\n", + "\n", + "- **Assessing compliance:** Under the Guideline for any validation report section, click **ASSESSMENT** and select the compliance status from the drop-down menu. 
(Learn more: [Provide compliance assessments](https://docs.validmind.ai/guide/model-validation/assess-compliance.html#provide-compliance-assessments))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Learn more\n", + "\n", + "Now that you're familiar with the basics, you can explore the following notebooks to get a deeper understanding on how the ValidMind Library assists you in streamlining model validation:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### More how-to guides and code samples\n", + "\n", + "- [Explore available tests in detail](../../how_to/explore_tests.ipynb)\n", + "- [In-depth guide on running dataset based tests](../../how_to/run_tests/1_run_dataset_based_tests.ipynb)\n", + "- [In-depth guide for running comparison tests](../../how_to/run_tests/2_run_comparison_tests.ipynb)\n", + "- [In-depth guide for implementing custom tests](../../code_samples/custom_tests/implement_custom_tests.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Discover more learning resources\n", + "\n", + "All notebook samples can be found in the following directories of the ValidMind Library GitHub repository:\n", + "\n", + "- [Code samples](https://github.com/validmind/validmind-library/tree/main/notebooks/code_samples)\n", + "- [How-to guides](https://github.com/validmind/validmind-library/tree/main/notebooks/how_to)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "name": "python", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/tutorials/model_validation/class-imbalance-results-detail.png b/site/notebooks/tutorials/model_validation/class-imbalance-results-detail.png new file mode 100644 index 0000000000..00251ecaac Binary files /dev/null and b/site/notebooks/tutorials/model_validation/class-imbalance-results-detail.png differ diff --git a/site/notebooks/tutorials/model_validation/compliance-summary.png b/site/notebooks/tutorials/model_validation/compliance-summary.png new file mode 100644 index 0000000000..a8abd2dd39 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/compliance-summary.png differ diff --git a/site/notebooks/tutorials/model_validation/inserted-class-imbalance-results.png b/site/notebooks/tutorials/model_validation/inserted-class-imbalance-results.png new file mode 100644 index 0000000000..2efea1a09b Binary files /dev/null and b/site/notebooks/tutorials/model_validation/inserted-class-imbalance-results.png differ diff --git a/site/notebooks/tutorials/model_validation/inserted-finding.png b/site/notebooks/tutorials/model_validation/inserted-finding.png new file mode 100644 index 0000000000..bb78436532 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/inserted-finding.png differ diff --git a/site/notebooks/tutorials/model_validation/inserted-minimum-f1-scores.png b/site/notebooks/tutorials/model_validation/inserted-minimum-f1-scores.png new file mode 100644 index 0000000000..52ae43c719 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/inserted-minimum-f1-scores.png differ diff --git a/site/notebooks/tutorials/model_validation/link-finding.png b/site/notebooks/tutorials/model_validation/link-finding.png new 
file mode 100644 index 0000000000..dd1c8a2662 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/link-finding.png differ diff --git a/site/notebooks/tutorials/model_validation/link-validator-evidence.png b/site/notebooks/tutorials/model_validation/link-validator-evidence.png new file mode 100644 index 0000000000..7403cad676 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/link-validator-evidence.png differ diff --git a/site/notebooks/tutorials/model_validation/link-validator-evidence_OLD.png b/site/notebooks/tutorials/model_validation/link-validator-evidence_OLD.png new file mode 100644 index 0000000000..58823e8f21 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/link-validator-evidence_OLD.png differ diff --git a/site/notebooks/tutorials/model_validation/lr_model_champion.pkl b/site/notebooks/tutorials/model_validation/lr_model_champion.pkl new file mode 100644 index 0000000000..9b81662b24 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/lr_model_champion.pkl differ diff --git a/site/notebooks/tutorials/model_validation/select-finding.png b/site/notebooks/tutorials/model_validation/select-finding.png new file mode 100644 index 0000000000..ba35661d58 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/select-finding.png differ diff --git a/site/notebooks/tutorials/model_validation/selecting-class-imbalance-results.png b/site/notebooks/tutorials/model_validation/selecting-class-imbalance-results.png new file mode 100644 index 0000000000..cf86874228 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/selecting-class-imbalance-results.png differ diff --git a/site/notebooks/tutorials/model_validation/selecting-minimum-f1-scores.png b/site/notebooks/tutorials/model_validation/selecting-minimum-f1-scores.png new file mode 100644 index 0000000000..60ae6b9605 Binary files /dev/null and b/site/notebooks/tutorials/model_validation/selecting-minimum-f1-scores.png differ diff --git a/site/tests/_metadata.yml b/site/tests/_metadata.yml index 0be10fd2f4..cb9ee5eeea 100644 --- a/site/tests/_metadata.yml +++ b/site/tests/_metadata.yml @@ -1,4 +1,10 @@ format: html: + grid: + sidebar-width: 450px + margin-width: 450px page-layout: full - css: /developer/developer.css \ No newline at end of file + from: markdown-smart + css: + - /validmind/validmind.css + - /developer/developer.css \ No newline at end of file diff --git a/site/validmind/_sidebar.yml b/site/validmind/_sidebar.yml index 50a77a540b..3ff543f9ac 100644 --- a/site/validmind/_sidebar.yml +++ b/site/validmind/_sidebar.yml @@ -2,7 +2,7 @@ website: sidebar: - id: validmind-reference - title: "ValidMind Library" + title: "ValidMind Library Python API" collapsed: false collapse-level: 2 contents: @@ -10,7 +10,7 @@ website: - text: "---" - text: "Python API" # Root level items from validmind.qmd - - text: "`2.8.12`" + - text: "`2.8.18`" file: validmind/validmind.qmd#version__ - text: "init" file: validmind/validmind.qmd#init @@ -40,6 +40,8 @@ website: file: validmind/validmind.qmd#tasks - text: "test" file: validmind/validmind.qmd#test + - text: "log_text" + file: validmind/validmind.qmd#log_text - text: " RawData" file: validmind/validmind.qmd#rawdata contents: diff --git a/site/validmind/validmind.qmd b/site/validmind/validmind.qmd index d946024b1a..b392295385 100644 --- a/site/validmind/validmind.qmd +++ b/site/validmind/validmind.qmd @@ -1,5 +1,5 @@ --- -title: "ValidMind Library" +title: "ValidMind Library Python API" aliases: 
- index.html sidebar: validmind-reference @@ -44,7 +44,7 @@ After you have pasted the code snippet into your development source code and exe ::: {.signature} -2.8.12 +2.8.18 ::: @@ -66,7 +66,7 @@ If the API key and secret are not provided, the client will attempt to retrieve **Arguments** -- `project (str, optional)`: The project CUID. Alias for model. Defaults to None. [DEPRECATED] +- `project (str, optional)`: The project CUID. Alias for model. Defaults to None. \[DEPRECATED\] - `model (str, optional)`: The model CUID. Defaults to None. - `api_key (str, optional)`: The API key. Defaults to None. - `api_secret (str, optional)`: The API secret. Defaults to None. @@ -225,11 +225,11 @@ Unit metrics are key-value pairs where the key is the metric name and the value **Arguments** - `key (str)`: The metric key -- `value (float)`: The metric value -- `inputs (list)`: A list of input IDs that were used to compute the metric. -- `params (dict)`: Dictionary of parameters used to compute the metric. -- `recorded_at (str)`: The timestamp of the metric. Server will use current time if not provided. -- `thresholds (dict)`: Dictionary of thresholds for the metric. +- `value (Union[int, float])`: The metric value +- `inputs (List[str])`: List of input IDs +- `params (Dict[str, Any])`: Parameters used to generate the metric +- `recorded_at (str)`: Timestamp when the metric was recorded +- `thresholds (Dict[str, Any])`: Thresholds for the metric ## preview_template @@ -421,6 +421,35 @@ The function may also include a docstring. This docstring will be used and logge - The decorated function. +## log_text + + + +::: {.signature} + +deflog_text(content_id:str,text:str,\_json:Optional\[Dict\[str, Any\]\]=None)Dict\[str, Any\]: + +::: + + + +Logs free-form text to ValidMind API. + +**Arguments** + +- `content_id (str)`: Unique content identifier for the text. +- `text (str)`: The text to log. Will be converted to HTML with MathML support. +- `_json (dict, optional)`: Additional metadata to associate with the text. Defaults to None. + +**Returns** + +- An accordion widget containing the logged text as HTML. + +**Raises** + +- `ValueError`: If content_id or text are empty or not strings. +- `Exception`: If the API call fails. + ## RawData diff --git a/site/validmind/validmind/errors.qmd b/site/validmind/validmind/errors.qmd index a1b02e1e82..8754de29cc 100644 --- a/site/validmind/validmind/errors.qmd +++ b/site/validmind/validmind/errors.qmd @@ -610,6 +610,27 @@ When an invalid metric results object is sent to the API. - [APIRequestError](#apirequesterror) - builtins.BaseException with_traceback, add_note +### InvalidParameterError + + + +::: {.signature} + +classInvalidParameterError(BaseError): + +::: + + + +When an invalid parameter is provided. 
diff --git a/site/validmind/validmind/errors.qmd b/site/validmind/validmind/errors.qmd
index a1b02e1e82..8754de29cc 100644
--- a/site/validmind/validmind/errors.qmd
+++ b/site/validmind/validmind/errors.qmd
@@ -610,6 +610,27 @@ When an invalid metric results object is sent to the API.

 - [APIRequestError](#apirequesterror)
 - builtins.BaseException with_traceback, add_note

+### InvalidParameterError
+
+
+
+::: {.signature}
+
+class InvalidParameterError(BaseError):
+
+:::
+
+
+
+When an invalid parameter is provided.
+
+
+
+**Inherited members**
+
+- [BaseError](#baseerror), [description](#description)
+- builtins.BaseException with_traceback, add_note
+
 ### InvalidProjectError
diff --git a/site/validmind/validmind/tests.qmd b/site/validmind/validmind/tests.qmd
index f77ae3ea2d..31cb608822 100644
--- a/site/validmind/validmind/tests.qmd
+++ b/site/validmind/validmind/tests.qmd
@@ -20,13 +20,21 @@ ValidMind Tests Module

 ::: {.signature}

-def list_tests(filter: Optional\[str\] = None, task: Optional\[str\] = None, tags: Optional\[List\[str\]\] = None, pretty: bool = True, truncate: bool = True) -> Union\[Dict\[str, Callable\[..., Any\]\], None\]:
+def list_tests(filter: Optional\[str\] = None, task: Optional\[str\] = None, tags: Optional\[List\[str\]\] = None, pretty: bool = True, truncate: bool = True) -> Union\[List\[str\], None\]:

 :::



-List all available tests with optional filtering
+List all tests in the tests directory.
+
+**Arguments**
+
+- `filter (str, optional)`: Find tests where the ID, tasks or tags match the filter string. Defaults to None.
+- `task (str, optional)`: Find tests that match the task. Can be used to narrow down matches from the filter string. Defaults to None.
+- `tags (list, optional)`: Find tests that match list of tags. Can be used to narrow down matches from the filter string. Defaults to None.
+- `pretty (bool, optional)`: If True, returns a pandas DataFrame with a formatted table. Defaults to True.
+- `truncate (bool, optional)`: If True, truncates the test description to the first line. Defaults to True. (only used if pretty=True)

 ## load_test
@@ -48,6 +56,7 @@ Test IDs are in the format `namespace.path_to_module.TestClassOrFuncName[:tag]`.

 - `test_id (str)`: The test ID in the format `namespace.path_to_module.TestName[:tag]`
 - `test_func (callable, optional)`: The test function to load. If not provided, the test will be loaded from the test provider. Defaults to None.
+- `reload (bool, optional)`: If True, reload the test even if it's already loaded. Defaults to False.

 ## describe_test
@@ -61,7 +70,14 @@ Test IDs are in the format `namespace.path_to_module.TestClassOrFuncName[:tag]`.



-Describe a test's functionality and parameters
+Get or show details about the test
+
+This function can be used to see test details including the test name, description, required inputs and default params. It can also be used to get a dictionary of the above information for programmatic use.
+
+**Arguments**
+
+- `test_id (str, optional)`: The test ID. Defaults to None.
+- `raw (bool, optional)`: If True, returns a dictionary with the test details. Defaults to False.

 ## run_test
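
Together, the expanded `list_tests` and `describe_test` docs above describe a discovery workflow along these lines — a sketch, assuming the test ID shown is available in the local catalog:

```python
import validmind as vm

# Narrow the catalog by filter string and task; pretty=True (the default)
# renders a formatted DataFrame, pretty=False returns the raw list of IDs.
test_ids = vm.tests.list_tests(
    filter="imbalance", task="classification", pretty=False
)

# Fetch one test's name, description, required inputs, and default params
# as a dictionary for programmatic use (raw=True).
details = vm.tests.describe_test(
    "validmind.data_validation.ClassImbalance", raw=True
)
```
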
@@ -69,7 +85,7 @@ Describe a test's functionality and parameters

 ::: {.signature}

-def run_test(test_id: Union\[TestID (Union of validmind.data_validation.\*, validmind.model_validation.\*, validmind.prompt_validation.\* and str), None\] = None, name: Union\[str, None\] = None, unit_metrics: Union\[List\[TestID (Unit metrics from validmind.unit_metrics.\*)\], None\] = None, inputs: Union\[Dict\[str, Any\], None\] = None, input_grid: Union\[Dict\[str, List\[Any\]\], List\[Dict\[str, Any\]\], None\] = None, params: Union\[Dict\[str, Any\], None\] = None, param_grid: Union\[Dict\[str, List\[Any\]\], List\[Dict\[str, Any\]\], None\] = None, show: bool = True, generate_description: bool = True, title: Optional\[str\] = None, post_process_fn: Union\[Callable\[\[validmind.vm_models.TestResult\], None\], None\] = None, \*\*kwargs) -> validmind.vm_models.TestResult:
+def run_test(test_id: Union\[TestID (Union of validmind.data_validation.\*, validmind.model_validation.\*, validmind.prompt_validation.\* and str), None\] = None, name: Union\[str, None\] = None, unit_metrics: Union\[List\[TestID (Unit metrics from validmind.unit_metrics.\*)\], None\] = None, inputs: Union\[Dict\[str, Any\], None\] = None, input_grid: Union\[Dict\[str, List\[Any\]\], List\[Dict\[str, Any\]\], None\] = None, params: Union\[Dict\[str, Any\], None\] = None, param_grid: Union\[Dict\[str, List\[Any\]\], List\[Dict\[str, Any\]\], None\] = None, show: bool = True, generate_description: bool = True, title: Optional\[str\] = None, post_process_fn: Union\[Callable\[\[validmind.vm_models.TestResult\], None\], None\] = None, show_params: bool = True, \*\*kwargs) -> validmind.vm_models.TestResult:

 :::
@@ -96,6 +112,7 @@ This function is the main entry point for running tests. It can run simple unit

 - `generate_description (bool, optional)`: Whether to generate a description. Defaults to True.
 - `title (str)`: Custom title for the test result
 - `post_process_fn (Callable[[TestResult], None])`: Function to post-process the test result
+- `show_params (bool, optional)`: Whether to include parameter values in figure titles for comparison tests. Defaults to True.

 **Returns**
@@ -112,13 +129,13 @@ This function is the main entry point for running tests. It can run simple unit

 ::: {.signature}

-def list_tags() -> Set\[str\]:
+def list_tags() -> List\[str\]:

 :::



-List all available tags
+List all unique available tags

 ## list_tasks
@@ -126,13 +143,13 @@ List all available tags

 ::: {.signature}

-def list_tasks() -> Set\[str\]:
+def list_tasks() -> List\[str\]:

 :::



-List all available tasks
+List all unique available tasks

 ## list_tasks_and_tags
@@ -146,7 +163,11 @@ List all available tasks



-List all available tasks and tags
+List all task types and their associated tags, with one row per task type and all tags for a task type in one row.
+
+**Returns**
+
+- A DataFrame with 'Task Type' and concatenated 'Tags'.

 ## test
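
The new `show_params` flag matters mostly for comparison runs, where each figure is otherwise titled with the parameter combination that produced it — a sketch, with the `min_percent_threshold` values treated as illustrative:

```python
import pandas as pd
import validmind as vm

# A toy imbalanced dataset; in practice this would be your real data.
df = pd.DataFrame({"x": range(100), "churn": [0] * 90 + [1] * 10})
vm_ds = vm.init_dataset(dataset=df, target_column="churn", input_id="demo_ds")

# A param_grid turns one test ID into a comparison test: one run per value.
# show_params=False drops the parameter values from the figure titles.
result = vm.tests.run_test(
    "validmind.data_validation.ClassImbalance",
    inputs={"dataset": vm_ds},
    param_grid={"min_percent_threshold": [5, 10, 20]},  # illustrative values
    show_params=False,
)
```
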
diff --git a/site/validmind/validmind/tests/data_validation/ClassImbalance.qmd b/site/validmind/validmind/tests/data_validation/ClassImbalance.qmd
index ccfa981c9a..4506dd76d7 100644
--- a/site/validmind/validmind/tests/data_validation/ClassImbalance.qmd
+++ b/site/validmind/validmind/tests/data_validation/ClassImbalance.qmd
@@ -18,7 +18,7 @@ Threshold based tests

 ::: {.signature}

-@tags('tabular_data', 'binary_classification', 'multiclass_classification')
+@tags('tabular_data', 'binary_classification', 'multiclass_classification', 'data_quality')

 @tasks('classification')
diff --git a/site/validmind/validmind/tests/data_validation/DatasetDescription.qmd b/site/validmind/validmind/tests/data_validation/DatasetDescription.qmd
index 88a04ecb7d..c3c8e31fc5 100644
--- a/site/validmind/validmind/tests/data_validation/DatasetDescription.qmd
+++ b/site/validmind/validmind/tests/data_validation/DatasetDescription.qmd
@@ -105,15 +105,3 @@ Will be used in favor of \_get_histogram in the future

 Returns a collection of histograms for a numerical column, each one with a different bin size
-
-
-
-## infer_datatypes
-
-
-
-::: {.signature}
-
-def infer_datatypes(df):
-
-:::
diff --git a/site/validmind/validmind/tests/data_validation/DescriptiveStatistics.qmd b/site/validmind/validmind/tests/data_validation/DescriptiveStatistics.qmd
index 2efa3c057f..d2fa820db8 100644
--- a/site/validmind/validmind/tests/data_validation/DescriptiveStatistics.qmd
+++ b/site/validmind/validmind/tests/data_validation/DescriptiveStatistics.qmd
@@ -14,7 +14,7 @@ toc-expand: 4

 ::: {.signature}

-@tags('tabular_data', 'time_series_data')
+@tags('tabular_data', 'time_series_data', 'data_quality')

 @tasks('classification', 'regression')
diff --git a/site/validmind/validmind/tests/data_validation/IQROutliersBarPlot.qmd b/site/validmind/validmind/tests/data_validation/IQROutliersBarPlot.qmd
index fa3f20eda4..ca5ed977de 100644
--- a/site/validmind/validmind/tests/data_validation/IQROutliersBarPlot.qmd
+++ b/site/validmind/validmind/tests/data_validation/IQROutliersBarPlot.qmd
@@ -49,7 +49,7 @@ The examination invokes a series of steps:

 1. For every numeric feature in the dataset, the 25th percentile (Q1) and 75th percentile (Q3) are calculated before deriving the Interquartile Range (IQR), the difference between Q1 and Q3.
 1. Subsequently, the metric calculates the lower and upper thresholds by subtracting Q1 from the `threshold` times IQR and adding Q3 to `threshold` times IQR, respectively. The default `threshold` is set at 1.5.
 1. Any value in the feature that falls below the lower threshold or exceeds the upper threshold is labeled as an outlier.
-1. The number of outliers are tallied for different percentiles, such as [0-25], [25-50], [50-75], and [75-100].
+1. The number of outliers are tallied for different percentiles, such as \[0-25\], \[25-50\], \[50-75\], and \[75-100\].
 1. These counts are employed to construct a bar plot for the feature, showcasing the distribution of outliers across different percentiles.

 ### Signs of High Risk
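
The `data_quality` tag added to `ClassImbalance` and `DescriptiveStatistics` above makes those tests discoverable as a group — a one-liner sketch using the `list_tests` filters documented earlier:

```python
import validmind as vm

# After this change, data-quality checks can be pulled up by tag alone.
vm.tests.list_tests(tags=["data_quality"])
```
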
diff --git a/site/validmind/validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.qmd b/site/validmind/validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.qmd
index b17dbf87db..fac8d44063 100644
--- a/site/validmind/validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.qmd
+++ b/site/validmind/validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.qmd
@@ -77,7 +77,7 @@ The test implements multiple threshold optimization methods:

 - `dataset`: VMDataset containing features and target
 - `model`: VMModel containing predictions
-- `methods`: List of methods to compare (default: ['youden', 'f1', 'precision_recall'])
+- `methods`: List of methods to compare (default: \['youden', 'f1', 'precision_recall'\])
 - `target_recall`: Target recall value if using 'target_recall' method

 **Returns**
diff --git a/site/validmind/validmind/version.qmd b/site/validmind/validmind/version.qmd
index be67330350..c84426b755 100644
--- a/site/validmind/validmind/version.qmd
+++ b/site/validmind/validmind/version.qmd
@@ -9,6 +9,6 @@ sidebar: validmind-reference

 ::: {.signature}

-2.8.12
+2.8.18

 :::
diff --git a/site/validmind/validmind/vm_models.qmd b/site/validmind/validmind/vm_models.qmd
index 7d195fe802..8633c5f48a 100644
--- a/site/validmind/validmind/vm_models.qmd
+++ b/site/validmind/validmind/vm_models.qmd
@@ -16,7 +16,7 @@ Models entrypoint

 ::: {.signature}

-R_MODEL_TYPES = ['LogisticRegression', 'LinearRegression', 'XGBClassifier', 'XGBRegressor']:
+R_MODEL_TYPES = \['LogisticRegression', 'LinearRegression', 'XGBClassifier', 'XGBRegressor'\]:

 :::
@@ -688,7 +688,7 @@ Add a new table to the result.

 **Arguments**

 - `table (Union[ResultTable, pd.DataFrame, List[Dict[str, Any]]])`: The table to add.
-- `title (Optional[str])`: The title of the table (can optionally be provided for pd.DataFrame and List\[Dict[str, Any]\] tables).
+- `title (Optional[str])`: The title of the table (can optionally be provided for pd.DataFrame and List\[Dict\[str, Any\]\] tables).

 ### check_result_id_exist
@@ -710,7 +710,7 @@ Check if the result_id exists in any test block across all sections.

 ::: {.signature}

-def log(self, section_id: str = None, position: int = None, unsafe: bool = False):
+def log(self, section_id: str = None, position: int = None, unsafe: bool = False, config: Dict\[str, bool\] = None):

 :::
@@ -723,6 +723,12 @@ Log the result to ValidMind.

 - `section_id (str)`: The section ID within the model document to insert the test result.
 - `position (int)`: The position (index) within the section to insert the test result.
 - `unsafe (bool)`: If True, log the result even if it contains sensitive data i.e. raw data from input datasets.
+- `config (Dict[str, bool])`: Configuration options for displaying the test result. Available config options:
+- hideTitle: Hide the title in the document view
+- hideText: Hide the description text in the document view
+- hideParams: Hide the parameters in the document view
+- hideTables: Hide tables in the document view
+- hideFigures: Hide figures in the document view

 ### log_async
@@ -730,7 +736,7 @@ Log the result to ValidMind.

 ::: {.signature}

-async def log_async(self, section_id: str = None, position: int = None, unsafe: bool = False):
+async def log_async(self, section_id: str = None, position: int = None, config: Dict\[str, bool\] = None):

 :::
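
The new `config` argument to `log` maps directly onto the five hide flags listed above — a sketch, assuming `result` is a `TestResult` from an earlier `run_test` call and treating the `section_id` as a placeholder:

```python
# Hide the generated description and parameter block in the document view,
# while keeping the title, tables, and figures visible.
result.log(
    section_id="data_quality",  # placeholder section ID
    config={"hideText": True, "hideParams": True},
)
```
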
@@ -794,6 +800,28 @@ Serialize the result for the API.

 :::

+### validate_log_config
+
+
+
+::: {.signature}
+
+def validate_log_config(self, config: Dict\[str, bool\]):
+
+:::
+
+
+
+Validate the configuration options for logging a test result
+
+**Arguments**
+
+- `config (Dict[str, bool])`: Configuration options to validate
+
+**Raises**
+
+- `InvalidParameterError`: If config contains invalid keys or non-boolean values
+
 ### test_name{.property}
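
Since `log` runs its `config` through `validate_log_config`, malformed options fail fast — a sketch of the failure mode, assuming the same `result` object as above:

```python
from validmind.errors import InvalidParameterError

try:
    # An unknown key ("hideFigs") and a non-boolean value ("yes") both trip
    # InvalidParameterError before anything is sent to the API.
    result.log(config={"hideFigs": True, "hideText": "yes"})
except InvalidParameterError as exc:
    print(f"Invalid log config: {exc}")
```
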