diff --git a/.github/actions/demo-notebook/action.yml b/.github/actions/demo-notebook/action.yml index bb7914a727..8e014230b8 100644 --- a/.github/actions/demo-notebook/action.yml +++ b/.github/actions/demo-notebook/action.yml @@ -6,9 +6,9 @@ inputs: description: "Load the created .env file" required: true -runs: +runs: using: "composite" - steps: + steps: - name: Install python3 for Jupyter Notebooks shell: bash run: | @@ -18,10 +18,11 @@ runs: - name: Install validmind for notebook execution shell: bash run: | - pip install validmind - pip install validmind[llm] - pip install fairlearn aequitas + pip install validmind + pip install validmind[llm] + pip install fairlearn aequitas pip install shap==0.44.1 + pip install anywidget - name: Ensure .env file is available shell: bash @@ -36,9 +37,9 @@ runs: shell: bash if: ${{ steps.find_env.outcome == 'success' }} run: | - cd site + cd site source ../${{ inputs.env_file }} - quarto render --profile exe-demo notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { + quarto render --profile exe-demo notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { echo "Execute for intro_for_model_developers_EXECUTED.ipynb failed"; cat render_errors.log; exit 1; diff --git a/.github/actions/prod-notebook/action.yml b/.github/actions/prod-notebook/action.yml index fc17da0dda..ce8612da46 100644 --- a/.github/actions/prod-notebook/action.yml +++ b/.github/actions/prod-notebook/action.yml @@ -6,9 +6,9 @@ inputs: description: "Load the created .env file" required: true -runs: +runs: using: "composite" - steps: + steps: - name: Install python3 for Jupyter Notebooks shell: bash run: | @@ -18,10 +18,11 @@ runs: - name: Install validmind for notebook execution shell: bash run: | - pip install validmind - pip install validmind[llm] - pip install fairlearn aequitas + pip install validmind + pip install validmind[llm] + pip install fairlearn aequitas pip install shap==0.44.1 + pip install anywidget - name: Ensure .env file is available shell: bash @@ -36,9 +37,9 @@ runs: shell: bash if: ${{ steps.find_env.outcome == 'success' }} run: | - cd site + cd site source ../${{ inputs.env_file }} - quarto render --profile exe-prod notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { + quarto render --profile exe-prod notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { echo "Execute for intro_for_model_developers_EXECUTED.ipynb failed"; cat render_errors.log; exit 1; diff --git a/.github/actions/staging-notebook/action.yml b/.github/actions/staging-notebook/action.yml index 4dfb84506c..f53d395380 100644 --- a/.github/actions/staging-notebook/action.yml +++ b/.github/actions/staging-notebook/action.yml @@ -6,9 +6,9 @@ inputs: description: "Load the created .env file" required: true -runs: +runs: using: "composite" - steps: + steps: - name: Install python3 for Jupyter Notebooks shell: bash run: | @@ -18,10 +18,11 @@ runs: - name: Install validmind for notebook execution shell: bash run: | - pip install validmind - pip install validmind[llm] - pip install fairlearn aequitas + pip install validmind + pip install validmind[llm] + pip install fairlearn aequitas pip install shap==0.44.1 + pip install anywidget - name: Ensure .env file is available shell: bash @@ -36,9 +37,9 @@ runs: shell: bash if: ${{ steps.find_env.outcome == 'success' }} run: | - cd site + cd site source ../${{ inputs.env_file }} - quarto render --profile exe-staging 
notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { + quarto render --profile exe-staging notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { echo "Execute for intro_for_model_developers_EXECUTED.ipynb failed"; cat render_errors.log; exit 1; diff --git a/.github/workflows/deploy-docs-prod.yaml b/.github/workflows/deploy-docs-prod.yaml index 5378cb7a05..8948ec862a 100644 --- a/.github/workflows/deploy-docs-prod.yaml +++ b/.github/workflows/deploy-docs-prod.yaml @@ -28,8 +28,8 @@ jobs: - name: Render prod docs site run: | - cd site - quarto render --profile production &> render_errors.log || { + cd site + quarto render --profile production &> render_errors.log || { echo "Quarto render failed immediately"; cat render_errors.log; exit 1; @@ -39,11 +39,11 @@ jobs: id: create_env run: | touch .env - echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env - echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env - echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env - echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env - cat .env + echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env + echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env + echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env + echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env + cat .env # Only execute the prod notebook if .env file is created - name: Execute prod Intro for Model Developers notebook diff --git a/.github/workflows/deploy-docs-staging.yaml b/.github/workflows/deploy-docs-staging.yaml index f45a06353c..b08982f28c 100644 --- a/.github/workflows/deploy-docs-staging.yaml +++ b/.github/workflows/deploy-docs-staging.yaml @@ -28,8 +28,8 @@ jobs: - name: Render staging docs site run: | - cd site - quarto render --profile staging &> render_errors.log || { + cd site + quarto render --profile staging &> render_errors.log || { echo "Quarto render failed immediately"; cat render_errors.log; exit 1; @@ -39,11 +39,11 @@ jobs: id: create_env run: | touch .env - echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env - echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env - echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env - echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env - cat .env + echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env + echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env + echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env + echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env + cat .env # Only execute the staging notebook if .env file is created - name: Execute staging Intro for Model Developers notebook diff --git a/.github/workflows/validate-docs-site.yaml b/.github/workflows/validate-docs-site.yaml index 8cea51499a..6838a4ca34 100644 --- a/.github/workflows/validate-docs-site.yaml +++ b/.github/workflows/validate-docs-site.yaml @@ -27,8 +27,8 @@ jobs: - name: Render demo docs site run: | - cd site - quarto render --profile development &> render_errors.log || { + cd site + quarto render --profile development &> render_errors.log || { echo "Quarto render failed immediately"; cat render_errors.log; exit 1; @@ -52,11 +52,11 @@ jobs: id: create_env run: | touch .env - echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env - echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env - echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env - echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env - cat .env + echo VM_API_HOST=${{ 
secrets.PLATFORM_API_HOST }} >> .env + echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env + echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env + echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env + cat .env # Only execute the demo notebook if .env file is created - name: Execute demo Intro for Model Developers notebook @@ -66,7 +66,7 @@ jobs: with: env_file: .env - - name: Test for warnings or errors + - name: Test for warnings or errors run: | if grep -q 'WARN:\|ERROR:' site/render_errors.log; then echo "Warnings or errors detected during Quarto render" @@ -76,7 +76,7 @@ jobs: echo "No warnings or errors detected during Quarto render" fi - # Demo bucket is in us-east-1 + # Demo bucket is in us-east-1 - name: Configure AWS credentials run: aws configure set aws_access_key_id ${{ secrets.AWS_ACCESS_KEY_ID }} && aws configure set aws_secret_access_key ${{ secrets.AWS_SECRET_ACCESS_KEY }} && aws configure set default.region us-east-1 diff --git a/site/guide/monitoring/example-f1-score.png b/site/guide/monitoring/example-f1-score.png new file mode 100644 index 0000000000..f81b1a13be Binary files /dev/null and b/site/guide/monitoring/example-f1-score.png differ diff --git a/site/guide/monitoring/metric-over-time-data.png b/site/guide/monitoring/metric-over-time-data.png index 4c3a264673..5b5bdbfad1 100644 Binary files a/site/guide/monitoring/metric-over-time-data.png and b/site/guide/monitoring/metric-over-time-data.png differ diff --git a/site/guide/monitoring/work-with-metrics-over-time.qmd b/site/guide/monitoring/work-with-metrics-over-time.qmd index 4cd46e67c2..26f6038fe2 100644 --- a/site/guide/monitoring/work-with-metrics-over-time.qmd +++ b/site/guide/monitoring/work-with-metrics-over-time.qmd @@ -5,18 +5,27 @@ date: last-modified Once generated via the {{< var validmind.developer >}}, view and add metrics over time to your ongoing monitoring plans in the {{< var validmind.platform >}}. -Metrics over time refers to the continued monitoring of a model's performance once it is deployed. Tracking how a model performs as new data is introduced or conditions change ensures that it remains accurate and reliable in real-world environments where data distributions or market conditions shift. +Metrics over time refers to the continued monitoring of a model's performance once it is deployed. Tracking how a model performs as new data is introduced or conditions change ensures that it remains accurate and reliable in real-world environments where data distributions or market conditions shift. -- Model performance is determined by continuously measuring metrics and comparing them over time to detect degradation, bias, or shifts in the model's output. -- Performance data is collected and tracked over time, often using a rolling window approach or real-time monitoring tools with the same metrics used in testing, but observed across different periods. +- Model performance is determined by continuously measuring metrics and comparing them over time to detect degradation, bias, or shifts in the model's output. +- Performance data is collected and tracked over time, often using a rolling window approach or real-time monitoring tools with the same metrics used in testing, but observed across different periods. - Continuous tracking helps to identify if and when a model needs to be recalibrated, retrained, or even replaced due to performance deterioration or changing conditions. 
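For orientation, the sketch below shows how a monitoring job might log one point of such a metric with the {{< var validmind.developer >}}. It is a minimal sketch only: the `log_metric` import path, the threshold format, and the metric value are assumptions for illustration, and the notebook linked in the margin documents the exact API.

```python
# Minimal sketch (assumptions noted in comments): log a single point of a
# metric-over-time series from a scheduled monitoring job.
import validmind as vm

# Assumed import path; see the "Log metrics over time" notebook for the
# version of the library you have installed.
from validmind.api_client import log_metric

# Connect to the model you are monitoring (values come from the model's
# code snippet in the ValidMind Platform).
vm.init(api_host="...", api_key="...", api_secret="...", model="...")

f1 = 0.82  # placeholder value computed by your monitoring pipeline

log_metric(
    key="F1 Score",
    value=f1,
    thresholds={"low_risk": 0.8, "medium_risk": 0.7},  # assumed threshold format
)
```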
+::: {.column-margin} +::: {.callout} +## **[Log metrics over time {{< fa hand-point-right >}}](/notebooks/how_to/log_metrics_over_time.ipynb)** + +Learn how to log metrics over time, set thresholds, and analyze model performance trends with our Jupyter Notebook sample. +::: + +::: + ::: {.attn} ## Prerequisites - [x] {{< var link.login >}} -- [x] Metrics over time have already been logged via the {{< var validmind.developer >}} for your model.[^1] +- [x] Metrics over time have already been logged via the {{< var validmind.developer >}} for your model.[^1] - [x] You are a [{{< fa code >}} Developer]{.bubble} or assigned another role with sufficient permissions to perform the tasks in this guide.[^2] ::: @@ -44,7 +53,7 @@ Metrics over time refers to the continued monitoring of a model's performance on - Select the metric over time to insert into the model documentation from the list of available metrics. - Search by name using **{{}} Search** on the top-left to locate specific metrics. - ![Metric over time blocks that have been selected for insertion](metrics-over-time-menu.png){width=90% fig-alt="A screenshot showing several metric over time blocks that have been selected for insertion" .screenshot} + ![Metric Over Time blocks that have been selected for insertion](metrics-over-time-menu.png){fig-alt="A screenshot showing several Metric Over Time blocks that have been selected for insertion" .screenshot group="time-metric"} To preview what is included in a metric, click on it. By default, the actively selected metric is previewed. @@ -52,6 +61,8 @@ Metrics over time refers to the continued monitoring of a model's performance on 8. After inserting the metrics into your document, review the data to confirm that it is accurate and relevant. + ![Example F1 Score — Metric Over Time visualization](example-f1-score.png){fig-alt="A screenshot showing an example F1 Score — Metric Over Time visualization" .screenshot group="time-metric"} + ## View metric over time metadata @@ -60,6 +71,7 @@ After you have added metrics over time to your document, you can view the follow - Date and time the metric was recorded - Who updated the metric - The numeric value of the metric +- The metric's thresholds - Any additional parameters 1. In the left sidebar, click **{{< fa cubes >}} Inventory**. @@ -68,11 +80,11 @@ After you have added metrics over time to your document, you can view the follow 3. In the left sidebar that appears for your model, click **{{< fa book-open >}} Documentation** or **{{< fa desktop >}} Ongoing Monitoring**. -4. Locate the metric whose metadata you want to view. +4. Locate the metric whose metadata you want to view. -5. Under the metric's name, click on **Data** tab. +5. Under the metric's name, click on **Data** tab. 
- ![](metric-over-time-data.png){width=85% fig-alt="A screenshot showing the Data tab within a metric over time" .screenshot} + ![Example Data tab within a Metric Over Time](metric-over-time-data.png){fig-alt="A screenshot showing an example Data tab within a Metric Over Time" .screenshot} ## What's next @@ -85,7 +97,7 @@ After you have added metrics over time to your document, you can view the follow -[^1]: [Intro to Unit Metrics](/notebooks/how_to/run_unit_metrics.ipynb) +[^1]: [Log metrics over time](/notebooks/how_to/log_metrics_over_time.ipynb) [^2]: [Manage permissions](/guide/configuration/manage-permissions.qmd) diff --git a/site/notebooks.zip b/site/notebooks.zip index b971c34903..f1f3a6f64c 100644 Binary files a/site/notebooks.zip and b/site/notebooks.zip differ diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb new file mode 100644 index 0000000000..3ee2b1e6bb --- /dev/null +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Document an application scorecard model\n", + "\n", + "Build and document an *application scorecard model* with the ValidMind Library by using Kaggle's [Lending Club](https://www.kaggle.com/datasets/devanshi23/loan-data-2007-2014/data) sample dataset to build a simple application scorecard.\n", + "\n", + "An application scorecard model is a type of statistical model used in credit scoring to evaluate the creditworthiness of potential borrowers by generating a score based on various characteristics of an applicant — such as credit history, income, employment status, and other relevant financial data. \n", + "\n", + "- This score helps lenders make decisions about whether to approve or reject loan applications, as well as determine the terms of the loan, including interest rates and credit limits. \n", + "- Application scorecard models enable lenders to manage risk efficiently while making the loan application process faster and more transparent for applicants.\n", + "\n", + "This interactive notebook provides a step-by-step guide for loading a demo dataset, preprocessing the raw data, training a model for testing, setting up test inputs, initializing the required ValidMind objects, running the test, and then logging the results to ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. 
For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n",
"\n",
"Signing up is FREE — Register with ValidMind
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: The [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Credit Risk Scorecard`\n", + " - Use case: `Credit Risk - CECL`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Document the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.datasets.credit_risk import lending_club\n", + "from validmind.utils import preview_test_config\n", + "\n", + "scorecard = lending_club.load_scorecard()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "lending_club.init_vm_objects(scorecard)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_config = lending_club.load_test_config(scorecard)\n", + "preview_test_config(test_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.run_documentation_tests(config=test_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", + "\n", + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. In the ValidMind Platform, go to the **Documentation** page for the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html))\n", + "\n", + "2. 
Expand the following sections and take a look around:\n", + "\n", + " - **2. Data Preparation**\n", + " - **3. Model Development**\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation (hint: some of the tests in **2.3. Feature Selection and Engineering** look like they need some attention), view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready.\n", + "\n", + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/developer/model-testing/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/developer/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-eEL8LtKG-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb index 1e956cc1aa..750ebc9672 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb @@ -605,7 +605,7 @@ "\n", " For each metric in the test results, include in the test overview:\n", " - The metric's purpose and what it measures\n", - " - Its mathematical formula in LaTeX notation\n", + " - Its mathematical formula\n", " - The range of possible values\n", " - What constitutes good/bad performance\n", " - How to interpret different values\n", diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb index 03a6180b83..26a983f10d 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb @@ -545,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -606,7 +606,7 @@ "\n", " For each metric in the test results, include in the test overview:\n", " - The metric's purpose and what it measures\n", - " - Its mathematical formula in LaTeX notation\n", + " - Its mathematical formula\n", " - The range of possible values\n", " - What constitutes good/bad performance\n", " - How to interpret different values\n", @@ -648,15 +648,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DatasetDescription:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DatasetDescription:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ").log()" ] }, { @@ -665,15 +662,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " }\n", - " ).log()" + "run_test(\n", + " 
\"validmind.data_validation.DescriptiveStatistics:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " }\n", + ").log()" ] }, { @@ -682,18 +676,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MissingValues:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MissingValues:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -702,18 +693,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.ClassImbalance:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.ClassImbalance:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 10\n", + " }\n", + ").log()" ] }, { @@ -722,18 +710,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.Duplicates:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.Duplicates:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -742,20 +727,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.HighCardinality:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"num_threshold\": 100,\n", - " \"percent_threshold\": 0.1,\n", - " \"threshold_type\": \"percent\"\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.HighCardinality:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"num_threshold\": 100,\n", + " \"percent_threshold\": 0.1,\n", + " \"threshold_type\": \"percent\"\n", + " }\n", + ").log()" ] }, { @@ -764,18 +746,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.Skewness:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"max_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.Skewness:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"max_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -784,18 +763,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.UniqueRows:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.UniqueRows:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ 
-804,18 +780,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TooManyZeroValues:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"max_percent_threshold\": 0.03\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TooManyZeroValues:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"max_percent_threshold\": 0.03\n", + " }\n", + ").log()" ] }, { @@ -824,18 +797,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.IQROutliersTable:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"threshold\": 5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.IQROutliersTable:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"threshold\": 5\n", + " }\n", + ").log()" ] }, { @@ -853,15 +823,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset,\n", + " }\n", + ").log()" ] }, { @@ -870,15 +837,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularDescriptionTables:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularDescriptionTables:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + ").log()" ] }, { @@ -887,18 +851,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MissingValues:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MissingValues:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -907,15 +868,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularNumericalHistograms:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + ").log()" ] }, { @@ -924,15 +882,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularCategoricalBarPlots:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularCategoricalBarPlots:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " 
},\n", + ").log()" ] }, { @@ -941,18 +896,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TargetRateBarPlots:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " params={\n", - " \"default_column\": lending_club.target_column,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TargetRateBarPlots:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + " params={\n", + " \"default_column\": lending_club.target_column,\n", + " },\n", + ").log()" ] }, { @@ -968,15 +920,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " }\n", + ").log()" ] }, { @@ -985,15 +934,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularDescriptionTables:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularDescriptionTables:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1002,18 +948,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.ClassImbalance:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.ClassImbalance:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 10\n", + " }\n", + ").log()" ] }, { @@ -1022,18 +965,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.UniqueRows:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.UniqueRows:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -1042,15 +982,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularNumericalHistograms:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1066,18 +1003,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MutualInformation:development_data\",\n", - " input_grid ={\n", - 
" \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.01,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MutualInformation:development_data\",\n", + " input_grid ={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.01,\n", + " },\n", + ").log()" ] }, { @@ -1086,15 +1020,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.PearsonCorrelationMatrix:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.PearsonCorrelationMatrix:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " }\n", + ").log()" ] }, { @@ -1103,19 +1034,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.HighPearsonCorrelation:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.3,\n", - " \"top_n_correlations\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.HighPearsonCorrelation:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.3,\n", + " \"top_n_correlations\": 10\n", + " }\n", + ").log()" ] }, { @@ -1124,18 +1052,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.WOEBinTable\",\n", - " input_grid={\n", - " \"dataset\": [vm_preprocess_dataset]\n", - " },\n", - " params={\n", - " \"breaks_adj\": lending_club.breaks_adj,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.WOEBinTable\",\n", + " input_grid={\n", + " \"dataset\": [vm_preprocess_dataset]\n", + " },\n", + " params={\n", + " \"breaks_adj\": lending_club.breaks_adj,\n", + " },\n", + ").log()" ] }, { @@ -1144,18 +1069,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.WOEBinPlots\",\n", - " input_grid={\n", - " \"dataset\": [vm_preprocess_dataset]\n", - " },\n", - " params={\n", - " \"breaks_adj\": lending_club.breaks_adj,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.WOEBinPlots\",\n", + " input_grid={\n", + " \"dataset\": [vm_preprocess_dataset]\n", + " },\n", + " params={\n", + " \"breaks_adj\": lending_club.breaks_adj,\n", + " },\n", + ").log()" ] }, { @@ -1173,15 +1095,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DatasetSplit\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DatasetSplit\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1190,15 +1109,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.ModelMetadata\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.ModelMetadata\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model, 
vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1207,15 +1123,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ModelParameters\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ModelParameters\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1231,16 +1144,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.GINITable\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.GINITable\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1249,16 +1159,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ClassifierPerformance\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ClassifierPerformance\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1267,19 +1174,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.TrainingTestDegradation:XGBoost\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.TrainingTestDegradation:XGBoost\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.1\n", + " }\n", + ").log()" ] }, { @@ -1288,19 +1192,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.TrainingTestDegradation:RandomForest\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_rf_model,\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.TrainingTestDegradation:RandomForest\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_rf_model,\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.1\n", + " }\n", + ").log()" ] }, { @@ -1309,23 +1210,19 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " # Run the test\n", - " result = run_test(\n", - " \"validmind.model_validation.sklearn.HyperParametersTuning\",\n", - " inputs={\n", - " \"model\": vm_xgb_model,\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - " params={\n", - " \"param_grid\": {'n_estimators': [50, 100]},\n", - " \"scoring\": ['roc_auc', 'recall'],\n", - " \"fit_params\": {'eval_set': [(x_test, y_test)], 'verbose': False},\n", - " \"thresholds\": [0.3, 0.5],\n", - " }\n", - " ).log()" + 
"run_test(\n", + " \"validmind.model_validation.sklearn.HyperParametersTuning\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + " params={\n", + " \"param_grid\": {'n_estimators': [50, 100]},\n", + " \"scoring\": ['roc_auc', 'recall'],\n", + " \"fit_params\": {'eval_set': [(x_test, y_test)], 'verbose': False},\n", + " \"thresholds\": [0.3, 0.5],\n", + " }\n", + ").log()" ] }, { @@ -1343,16 +1240,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ROCCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ROCCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1361,19 +1255,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumROCAUCScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumROCAUCScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.5\n", + " }\n", + ").log()" ] }, { @@ -1382,16 +1273,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1400,16 +1288,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.CumulativePredictionProbabilities\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.CumulativePredictionProbabilities\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1418,20 +1303,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"num_bins\": 10,\n", - " \"mode\": \"fixed\"\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"num_bins\": 10,\n", + " \"mode\": \"fixed\"\n", + " }\n", + ").log()" ] }, { @@ -1447,19 +1329,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - 
"\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ClassifierThresholdOptimization\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " \"model\": vm_xgb_model\n", - " },\n", - " params={\n", - " \"target_recall\": 0.8 # Find a threshold that achieves a recall of 80%\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ClassifierThresholdOptimization\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " \"model\": vm_xgb_model\n", + " },\n", + " params={\n", + " \"target_recall\": 0.8 # Find a threshold that achieves a recall of 80%\n", + " }\n", + ").log()" ] }, { @@ -1468,16 +1347,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.CalibrationCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.CalibrationCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1486,16 +1362,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ConfusionMatrix\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ConfusionMatrix\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1504,19 +1377,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumAccuracy\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.7\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumAccuracy\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.7\n", + " }\n", + ").log()" ] }, { @@ -1525,19 +1395,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumF1Score\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumF1Score\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.5\n", + " }\n", + ").log()" ] }, { @@ -1546,16 +1413,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PrecisionRecallCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model]\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PrecisionRecallCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model]\n", + " },\n", + ").log()" ] }, { @@ -1571,16 +1435,13 @@ 
"metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" ] }, { @@ -1589,19 +1450,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", - " inputs={\n", - " \"model\": vm_xgb_model,\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"cut_off_threshold\": 0.04\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"cut_off_threshold\": 0.04\n", + " }\n", + ").log()" ] }, { @@ -1610,26 +1468,23 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"scaling_factor_std_dev_list\": [\n", - " 0.1,\n", - " 0.2,\n", - " 0.3,\n", - " 0.4,\n", - " 0.5\n", - " ],\n", - " \"performance_decay_threshold\": 0.05\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"scaling_factor_std_dev_list\": [\n", + " 0.1,\n", + " 0.2,\n", + " 0.3,\n", + " 0.4,\n", + " 0.5\n", + " ],\n", + " \"performance_decay_threshold\": 0.05\n", + " }\n", + ").log()" ] }, { @@ -1647,16 +1502,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model]\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model]\n", + " }\n", + ").log()" ] }, { @@ -1665,16 +1517,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.FeaturesAUC\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.FeaturesAUC\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1683,20 +1532,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"kernel_explainer_samples\": 10,\n", - " \"tree_or_linear_explainer_samples\": 200,\n", - " }\n", - " 
).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"kernel_explainer_samples\": 10,\n", + " \"tree_or_linear_explainer_samples\": 200,\n", + " }\n", + ").log()" ] }, { @@ -1712,18 +1558,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.ScorecardHistogram\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"score_column\": \"xgb_scores\",\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.ScorecardHistogram\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " },\n", + ").log()" ] }, { @@ -1732,20 +1575,115 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", + "run_test(\n", + " \"validmind.data_validation.ScoreBandDefaultRates\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params = {\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.ScoreProbabilityAlignment\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom tests\n", + "\n", + "Custom tests extend the functionality of ValidMind, allowing you to document any model or use case with added flexibility.\n", "\n", - " run_test(\n", - " \"validmind.data_validation.ScoreBandDefaultRates\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params = {\n", - " \"score_column\": \"xgb_scores\",\n", - " \"score_bands\": [500, 540, 570]\n", - " }\n", - " ).log()" + "ValidMind provides a comprehensive set of tests out-of-the-box to evaluate and document your models and datasets. We recognize there will be cases where the default tests do not support a model or dataset, or specific documentation is needed. In these cases, you can create and use your own custom code to accomplish what you need. To streamline custom code integration, we support the creation of custom test functions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### In-line custom tests\n", + "\n", + "The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ScoreToOdds\"`. The function `score_to_odds_analysis` takes three arguments `dataset`, `score_column`, and `score_bands`. This is a `VMDataset` and the rest are parameters that can be passed in." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ScoreToOdds\")\n", + "def score_to_odds_analysis(dataset, score_column='score', score_bands=[410, 440, 470]):\n", + " \"\"\"\n", + " Analyzes the relationship between score bands and odds (good:bad ratio).\n", + " Good odds = (1 - default_rate) / default_rate\n", + " \n", + " Higher scores should correspond to higher odds of being good.\n", + " \"\"\"\n", + " df = dataset.df\n", + " \n", + " # Create score bands\n", + " df['score_band'] = pd.cut(\n", + " df[score_column],\n", + " bins=[-np.inf] + score_bands + [np.inf],\n", + " labels=[f'<{score_bands[0]}'] + \n", + " [f'{score_bands[i]}-{score_bands[i+1]}' for i in range(len(score_bands)-1)] +\n", + " [f'>{score_bands[-1]}']\n", + " )\n", + " \n", + " # Calculate metrics per band\n", + " results = df.groupby('score_band').agg({\n", + " dataset.target_column: ['mean', 'count']\n", + " })\n", + " \n", + " results.columns = ['Default Rate', 'Total']\n", + " results['Good Count'] = results['Total'] - (results['Default Rate'] * results['Total'])\n", + " results['Bad Count'] = results['Default Rate'] * results['Total']\n", + " results['Odds'] = results['Good Count'] / results['Bad Count']\n", + " \n", + " # Create visualization\n", + " fig = go.Figure()\n", + " \n", + " # Add odds bars\n", + " fig.add_trace(go.Bar(\n", + " name='Odds (Good:Bad)',\n", + " x=results.index,\n", + " y=results['Odds'],\n", + " marker_color='blue'\n", + " ))\n", + " \n", + " fig.update_layout(\n", + " title='Score-to-Odds Analysis',\n", + " yaxis=dict(title='Odds Ratio (Good:Bad)'),\n", + " showlegend=False\n", + " )\n", + " \n", + " return fig" ] }, { @@ -1754,19 +1692,71 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", + "run_test(\n", + " \"my_custom_tests.ScoreToOdds\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Local test provider\n", + "\n", + "The ValidMind Library offers the ability to extend the built-in library of tests with custom tests. A test \"Provider\" is a Python class that gets registered with the ValidMind Library and loads tests based on a test ID, for example `my_test_provider.my_test_id`. The built-in suite of tests that ValidMind offers is technically its own test provider. You can use one the built-in test provider offered by ValidMind (`validmind.tests.test_providers.LocalTestProvider`) or you can create your own. More than likely, you'll want to use the `LocalTestProvider` to add a directory of custom tests but there's flexibility to be able to load tests from any source." 
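As a sketch of that flexibility, a folder-based provider might look like the following, assuming a provider only needs to expose a `load_test()` method that maps a test ID to a test function (check the library reference for the exact protocol before relying on this):

```python
import importlib.util
from pathlib import Path

import validmind as vm


class FolderTestProvider:
    """Illustrative provider: loads one test function per .py file under a folder."""

    def __init__(self, root: str):
        self.root = Path(root)

    def load_test(self, test_id: str):  # assumed interface, mirrored from LocalTestProvider usage
        # "classification.ConfusionMatrix" -> <root>/classification/ConfusionMatrix.py
        path = self.root.joinpath(*test_id.split(".")).with_suffix(".py")
        spec = importlib.util.spec_from_file_location(test_id, path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        # By convention the test function is named after the file
        return getattr(module, path.stem)


vm.tests.register_test_provider(
    namespace="my_test_provider",
    test_provider=FolderTestProvider("custom_tests"),
)
```

In most cases `LocalTestProvider` already covers this; a custom provider is only worth writing when tests live somewhere other than a local folder.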
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import LocalTestProvider\n", "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ScoreProbabilityAlignment\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"score_column\": \"xgb_scores\",\n", - " },\n", - " ).log()" + "# Define the folder where your tests are located\n", + "tests_folder = \"custom_tests\"\n", + "\n", + "# initialize the test provider with the tests folder we created earlier\n", + "my_test_provider = LocalTestProvider(tests_folder)\n", + "\n", + "vm.tests.register_test_provider(\n", + " namespace=\"my_test_provider\",\n", + " test_provider=my_test_provider,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our test provider set up, we can run any test that's located in our tests folder by using the `run_test()` method. This function is your entry point to running single tests in the ValidMind Library. It takes a test ID and runs the test associated with that ID. For our custom tests, the test ID will be the `namespace` specified when registering the provider, followed by the path to the test file relative to the tests folder. For example, the Confusion Matrix test we created earlier will have the test ID `my_test_provider.ConfusionMatrix`. You could organize the tests in subfolders, say `classification` and `regression`, and the test ID for the Confusion Matrix test would then be `my_test_provider.classification.ConfusionMatrix`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"my_test_provider.ScoreBandDiscriminationMetrics\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " }\n", + ").log(section_id=\"interpretability_insights\")" ] }, { diff --git a/site/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py b/site/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py new file mode 100644 index 0000000000..62127d82ec --- /dev/null +++ b/site/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py @@ -0,0 +1,193 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from sklearn.metrics import roc_curve, roc_auc_score +from typing import Tuple +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +@tags("visualization", "credit_risk", "scorecard") +@tasks("classification") +def ScoreBandDiscriminationMetrics( + dataset: VMDataset, + model: VMModel, + score_column: str = "score", + score_bands: list = None, + title: str = "Score Band Discrimination Metrics", +) -> Tuple[go.Figure, pd.DataFrame]: + """ + Evaluates discrimination metrics (AUC, GINI, KS) across different score bands for credit risk assessment. + + ### Purpose + + The Score Band Discrimination Metrics test is designed to evaluate the model's discriminatory power across + different score ranges. 
By segmenting the score distribution into bands and calculating key discrimination + metrics within each band, this test helps identify whether the model maintains consistent performance across + the entire score spectrum. This is crucial for understanding if the model's ability to separate good and bad + accounts varies significantly across different score ranges. + + ### Test Mechanism + + This test proceeds by first segmenting the score distribution into predefined bands. For each band, it + calculates three key discrimination metrics: AUC (Area Under the Curve), GINI coefficient, and KS + (Kolmogorov-Smirnov) statistic. The AUC measures the model's ability to rank order risk, the GINI + coefficient provides a measure of inequality in the predictions, and the KS statistic quantifies the maximum + separation between cumulative distributions. The test also tracks the population distribution and default + rates across bands to provide context for the discrimination metrics. + + ### Signs of High Risk + + - Significant variations in discrimination metrics between adjacent score bands + - Very low metric values in specific score ranges, indicating poor discrimination + - Inconsistent patterns in metric values across the score spectrum + - Large disparities between band-specific metrics and overall metrics + - Unexpected relationships between default rates and discrimination metrics + - Insufficient population in certain score bands for reliable metric calculation + + ### Strengths + + - Provides a comprehensive view of model discrimination across the score spectrum + - Combines multiple complementary metrics for robust performance assessment + - Identifies specific score ranges where model performance might be suboptimal + - Includes population and default rate context for better interpretation + - Handles edge cases such as single-class bands and insufficient data + - Enables visual comparison of metrics across score bands + + ### Limitations + + - Requires sufficient data in each score band for reliable metric calculation + - May be sensitive to the choice of score band boundaries + - Does not account for business importance of different score ranges + - Metrics may be unstable in bands with very low default rates + - Cannot directly suggest optimal score band boundaries + - Limited to assessing discrimination aspects of model performance + """ + if score_column not in dataset.df.columns: + raise ValueError(f"Score column '{score_column}' not found in dataset") + + df = dataset.df.copy() + + # Default score bands if none provided + if score_bands is None: + score_bands = [410, 440, 470] + + # Create band labels + band_labels = [ + f"{score_bands[i]}-{score_bands[i+1]}" for i in range(len(score_bands) - 1) + ] + band_labels.insert(0, f"<{score_bands[0]}") + band_labels.append(f">{score_bands[-1]}") + + # Bin the scores + df["score_band"] = pd.cut( + df[score_column], bins=[-np.inf] + score_bands + [np.inf], labels=band_labels + ) + + # Calculate metrics for each band + results = [] + for band in band_labels: + band_mask = df["score_band"] == band + if band_mask.sum() > 1: # Need at least 2 samples + y_true = df[band_mask][dataset.target_column].values + y_prob = dataset.y_prob(model)[ + band_mask + ] # Get predicted probabilities using dataset method + + # Convert to float arrays + y_true = np.array(y_true, dtype=float) + y_prob = np.array(y_prob, dtype=float) + + # Calculate metrics + try: + fpr, tpr, _ = roc_curve(y_true, y_prob) + ks = max(tpr - fpr) + auc = roc_auc_score(y_true, y_prob) 
+ gini = 2 * auc - 1 + except ValueError: # Handle cases with single class + ks, auc, gini = 0, 0.5, 0 + + results.append( + { + "Score Band": band, + "Population Count": band_mask.sum(), + "Population (%)": (band_mask.sum() / len(df)) * 100, + "AUC": auc, + "GINI": gini, + "KS": ks, + "Default Rate (%)": (y_true.mean() * 100), + } + ) + + # Calculate total metrics + y_true = df[dataset.target_column].values + y_prob = dataset.y_prob(model) # Get predicted probabilities for total calculation + + fpr, tpr, _ = roc_curve(y_true, y_prob) + total_ks = max(tpr - fpr) + total_auc = roc_auc_score(y_true, y_prob) + total_gini = 2 * total_auc - 1 + + # Add total row + results.append( + { + "Score Band": f"Total ({df[score_column].min():.0f}-{df[score_column].max():.0f})", + "Population Count": len(df), + "Population (%)": 100.0, + "AUC": total_auc, + "GINI": total_gini, + "KS": total_ks, + "Default Rate (%)": (y_true.mean() * 100), + } + ) + + results_df = pd.DataFrame(results) + + # Create visualization (excluding total) + fig = go.Figure() + + # Filter out the total row for plotting + plot_df = results_df[results_df["Score Band"].str.contains("Total") == False] + + # Add metric bars + for metric, color in [ + ("AUC", "rgb(31, 119, 180)"), + ("GINI", "rgb(255, 127, 14)"), + ("KS", "rgb(44, 160, 44)"), + ]: + fig.add_trace( + go.Bar( + name=metric, + x=plot_df["Score Band"], + y=plot_df[metric], + marker_color=color, + ) + ) + + # Add default rate line (excluding total) + fig.add_trace( + go.Scatter( + name="Default Rate (%)", + x=plot_df["Score Band"], + y=plot_df["Default Rate (%)"], + yaxis="y2", + line=dict(color="red", width=2), + ) + ) + + # Update layout + fig.update_layout( + title=title, + xaxis_title="Score Band", + yaxis_title="Discrimination Metrics", + yaxis2=dict(title="Default Rate (%)", overlaying="y", side="right"), + barmode="group", + showlegend=True, + height=600, + ) + + return fig, results_df diff --git a/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb new file mode 100644 index 0000000000..ab5d6d4bf6 --- /dev/null +++ b/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -0,0 +1,1215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ongoing Monitoring for Application Scorecard \n", + "\n", + "In this notebook, you'll learn how to seamlessly monitor your production models using the ValidMind Platform.\n", + "\n", + "We'll walk you through the process of initializing the ValidMind Library, loading a sample dataset and model, and running a monitoring test suite to quickly generate documentation about your new data and model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation, validation, monitoring tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you explore the available resources for developers at some point. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "
\n", + "Signing up is FREE — Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Model monitoring documentation**: A comprehensive and structured record of a production model, including key elements such as data sources, inputs, performance metrics, and periodic evaluations. This documentation ensures transparency and visibility of the model's performance in the production environment.\n", + "\n", + "**Monitoring documentation template**: Similar to documentation template, The monitoring documentation template functions as a test suite and lays out the structure of model monitoring documentation, segmented into various sections and sub-sections. Monitoring documentation templates define the structure of your model monitoring documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Binary classification`\n", + " - Use case: `Marketing/Sales - Attrition/Churn Management`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\",\n", + " monitoring = True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize the Python environment\n", + "\n", + "Next, let's import the necessary libraries and set up your Python environment for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "import numpy as np\n", + "\n", + "from datetime import datetime, timedelta\n", + "\n", + "from validmind.tests import run_test\n", + "from validmind.datasets.credit_risk import lending_club\n", + "from validmind.unit_metrics import list_metrics\n", + "from validmind.unit_metrics import describe_metric\n", + "from validmind.unit_metrics import run_metric\n", + "from validmind.api_client import log_metric\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preview the monitoring template\n", + "\n", + "A template predefines sections for your model monitoring documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "You will upload documentation and test results into this template later on. 
For now, take a look at the structure that the template provides with the `vm.preview_template()` function from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the reference and monitoring datasets\n", + "\n", + "The sample dataset used here is provided by the ValidMind library. For demonstration purposes we'll use the training, test dataset splits as `reference` and `monitoring` datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = lending_club.load_data(source=\"offline\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocess_df = lending_club.preprocess(df)\n", + "preprocess_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fe_df = lending_club.feature_engineering(preprocess_df)\n", + "fe_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the model\n", + "\n", + "In this section, we focus on constructing and refining our predictive model. \n", + "- We begin by dividing our data, which is based on Weight of Evidence (WoE) features, into training and testing sets (`train_df`, `test_df`). \n", + "- With `lending_club.split`, we employ a simple random split, randomly allocating data points to each set to ensure a mix of examples in both." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Split the data\n", + "train_df, test_df = lending_club.split(fe_df, test_size=0.2)\n", + "\n", + "x_train = train_df.drop(lending_club.target_column, axis=1)\n", + "y_train = train_df[lending_club.target_column]\n", + "\n", + "x_test = test_df.drop(lending_club.target_column, axis=1)\n", + "y_test = test_df[lending_club.target_column]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the XGBoost model\n", + "xgb_model = xgb.XGBClassifier(\n", + " n_estimators=50, \n", + " random_state=42, \n", + " early_stopping_rounds=10\n", + ")\n", + "xgb_model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "\n", + "# Fit the model\n", + "xgb_model.fit(\n", + " x_train, \n", + " y_train,\n", + " eval_set=[(x_test, y_test)],\n", + " verbose=False\n", + ")\n", + "\n", + "# Compute probabilities\n", + "train_xgb_prob = xgb_model.predict_proba(x_train)[:, 1]\n", + "test_xgb_prob = xgb_model.predict_proba(x_test)[:, 1]\n", + "\n", + "# Compute binary predictions\n", + "cut_off_threshold = 0.3\n", + "train_xgb_binary_predictions = (train_xgb_prob > cut_off_threshold).astype(int)\n", + "test_xgb_binary_predictions = (test_xgb_prob > cut_off_threshold).astype(int)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize the ValidMind datasets\n", + "\n", + "Before you can run tests, you must first initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module.\n", + "\n", + "This function takes a number of arguments:\n", + "\n", + "- `dataset` — The raw dataset that you want to provide as input to tests.\n", + "- 
`input_id` - A unique identifier that allows tracking what inputs are used when running each individual test.\n", + "- `target_column` — A required argument if tests require access to true values. This is the name of the target column in the dataset.\n", + "\n", + "With all datasets ready, you can now initialize training, reference(test) and monitor datasets (`reference_df` and `monitor_df`) created earlier into their own dataset objects using [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "vm_reference_ds = vm.init_dataset(\n", + " dataset=train_df,\n", + " input_id=\"reference_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")\n", + "\n", + "vm_monitoring_ds = vm.init_dataset(\n", + " dataset=test_df,\n", + " input_id=\"monitoring_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize a model object\n", + "\n", + "You will also need to initialize a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data. You simply intialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model):" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "vm_xgb_model = vm.init_model(\n", + " xgb_model,\n", + " input_id=\"xgb_model\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Assign prediction values and probabilities to the datasets\n", + "\n", + "With our model now trained, we'll move on to assigning both the predictive probabilities coming directly from the model's predictions, and the binary prediction after applying the cutoff threshold described in the previous steps. \n", + "- These tasks are achieved through the use of the `assign_predictions()` method associated with the VM `dataset` object.\n", + "- This method links the model's class prediction values and probabilities to our VM train and test datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "vm_reference_ds.assign_predictions(\n", + " model=vm_xgb_model,\n", + " prediction_values=train_xgb_binary_predictions,\n", + " prediction_probabilities=train_xgb_prob,\n", + ")\n", + "\n", + "vm_monitoring_ds.assign_predictions(\n", + " model=vm_xgb_model,\n", + " prediction_values=test_xgb_binary_predictions,\n", + " prediction_probabilities=test_xgb_prob,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compute credit risk scores\n", + "\n", + "In this phase, we translate model predictions into actionable scores using probability estimates generated by our trained model." 
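The demo's `lending_club.compute_scores()` performs this mapping for us. For intuition only, a generic points-to-double-the-odds (PDO) scaling looks roughly like the sketch below; the `target_score`, `target_odds`, and `pdo` values are illustrative and are not the demo's actual calibration:

```python
import numpy as np


def probabilities_to_scores(probs, target_score=600, target_odds=50, pdo=20):
    """Map default probabilities to scorecard points using a standard PDO scaling."""
    factor = pdo / np.log(2)
    offset = target_score - factor * np.log(target_odds)
    odds = (1 - probs) / probs  # good:bad odds
    return offset + factor * np.log(odds)


# e.g. scores = probabilities_to_scores(test_xgb_prob)
```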
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "train_xgb_scores = lending_club.compute_scores(train_xgb_prob)\n", + "test_xgb_scores = lending_club.compute_scores(test_xgb_prob)\n", + "\n", + "# Assign scores to the datasets\n", + "vm_reference_ds.add_extra_column(\"xgb_scores\", train_xgb_scores)\n", + "vm_monitoring_ds.add_extra_column(\"xgb_scores\", test_xgb_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adding custom context to the LLM descriptions\n", + "\n", + "To enable the LLM descriptions context, you need to set the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `1`. This will enable the LLM descriptions context, which will be used to provide additional context to the LLM descriptions. This is a global setting that will affect all tests." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED\"] = \"1\"\n", + "\n", + "context = \"\"\"\n", + "FORMAT FOR THE LLM DESCRIPTIONS: \n", + " **** is designed to .\n", + "\n", + " The test operates by \n", + "\n", + " The primary advantages of this test include \n", + "\n", + " Users should be aware that \n", + "\n", + " **Key Insights:**\n", + "\n", + " The test results reveal:\n", + "\n", + " - ****: \n", + " - ****: \n", + " ...\n", + "\n", + " Based on these results, \n", + "\n", + "ADDITIONAL INSTRUCTIONS:\n", + " Present insights in order from general to specific, with each insight as a single bullet point with bold title.\n", + "\n", + " For each metric in the test results, include in the test overview:\n", + " - The metric's purpose and what it measures\n", + " - Its mathematical formula\n", + " - The range of possible values\n", + " - What constitutes good/bad performance\n", + " - How to interpret different values\n", + "\n", + " Each insight should progressively cover:\n", + " 1. Overall scope and distribution\n", + " 2. Complete breakdown of all elements with specific values\n", + " 3. Natural groupings and patterns\n", + " 4. Comparative analysis between datasets/categories\n", + " 5. Stability and variations\n", + " 6. Notable relationships or dependencies\n", + "\n", + " Remember:\n", + " - Keep all insights at the same level (no sub-bullets or nested structures)\n", + " - Make each insight complete and self-contained\n", + " - Include specific numerical values and ranges\n", + " - Cover all elements in the results comprehensively\n", + " - Maintain clear, concise language\n", + " - Use only \"- **Title**: Description\" format for insights\n", + " - Progress naturally from general to specific observations\n", + "\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitoring data description\n", + "\n", + "The Monitoring Data Description tests aim to provide a comprehensive statistical analysis of the monitoring dataset's characteristics. 
These tests examine the basic statistical properties, identify any missing data patterns, assess data uniqueness, visualize numerical feature distributions, and evaluate feature relationships through correlation analysis.\n", + "\n", + "The primary objective is to establish a baseline understanding of the monitoring data's structure and quality, enabling the detection of any significant deviations from expected patterns that could impact model performance. Each test is designed to capture different aspects of the data, from univariate statistics to multivariate relationships, providing a foundation for ongoing data quality assessment in the production environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.MissingValues:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.UniqueRows:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.PearsonCorrelationMatrix:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.HighPearsonCorrelation:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"feature_columns\": vm_monitoring_ds.feature_columns,\n", + " \"max_threshold\": 0.5,\n", + " \"top_n_correlations\": 10\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassImbalanceDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 1\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Target and feature drift\n", + "\n", + "Next, the goal is to investigate the distributional characteristics of predictions and features to determine if the underlying data has changed. These tests are crucial for assessing the expected accuracy of the model.\n", + "\n", + "1. **Target drift:** We compare the dataset used for testing (reference data) with the monitoring data. This helps to identify any shifts in the target variable distribution.\n", + "2. 
**Feature drift:** We compare the training dataset with the monitoring data. Since features were used to train the model, any drift in these features could indicate potential issues, as the underlying patterns that the model was trained on may have changed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we can examine the correlation between features and predictions. Significant changes in these correlations may trigger a deeper assessment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.TargetPredictionDistributionPlot\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we want see difference in correlation pairs between model prediction and features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.PredictionCorrelation\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally for target drift, let's plot each prediction value and feature grid side by side." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.PredictionQuantilesAcrossFeatures\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's add run a test to investigate how or if the features have drifted. In this instance we want to compare the training data with prediction data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.FeatureDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"psi_threshold\": 0.2,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classification accuracy\n", + "\n", + "We now evaluate the model's predictive performance by comparing its behavior between reference and monitoring datasets. These tests analyze shifts in overall accuracy metrics, examine changes in the confusion matrix to identify specific classification pattern changes, and assess the model's probability calibration across different prediction thresholds. \n", + "\n", + "The primary objective is to detect any degradation in the model's classification performance that might indicate reliability issues in production. 
The tests provide both aggregate performance metrics and detailed breakdowns of prediction patterns, enabling the identification of specific areas where the model's accuracy might be deteriorating." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassificationAccuracyDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ConfusionMatrixDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.CalibrationCurveDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"n_bins\": 10,\n", + " \"drift_pct_threshold\": 10,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Class discrimination\n", + "\n", + "The following tests assess the model's ability to effectively separate different classes in both reference and monitoring datasets. These tests analyze the model's discriminative power by examining the separation between class distributions, evaluating changes in the ROC curve characteristics, comparing probability distribution patterns, and assessing cumulative prediction trends. \n", + "\n", + "The primary objective is to identify any deterioration in the model's ability to distinguish between classes, which could indicate a decline in model effectiveness. The tests examine both the overall discriminative capability and the granular patterns in prediction distributions, providing insights into whether the model maintains its ability to effectively differentiate between classes in the production environment." 
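The `drift_pct_threshold` parameter used by these tests bounds how far a metric may move between the reference and monitoring datasets. A simplified sketch of that kind of check, for illustration rather than as the library's implementation:

```python
def metric_drift_pct(reference_value: float, monitoring_value: float) -> float:
    """Relative change of a metric between reference and monitoring data, in percent."""
    return abs(monitoring_value - reference_value) / abs(reference_value) * 100


# e.g. flag accuracy drift against a 5% threshold
drifted = metric_drift_pct(0.91, 0.86) > 5  # True: ~5.5% relative change exceeds 5%
```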
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassDiscriminationDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ROCCurveDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.PredictionProbabilitiesHistogramDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 10,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.CumulativePredictionProbabilitiesDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scoring\n", + "\n", + "Next we analyze the distribution and stability of credit scores across reference and monitoring datasets. These tests evaluate shifts in score distributions, examine changes in score band populations, and assess the relationship between scores and default rates. \n", + "\n", + "The primary objective is to identify any significant changes in how the model assigns credit scores, which could indicate drift in risk assessment capabilities. The tests examine both the overall score distribution patterns and the specific performance within defined score bands, providing insights into whether the model maintains consistent and reliable risk segmentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ScorecardHistogramDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"drift_pct_threshold\": 20,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ScoreBandsDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " \"drift_pct_threshold\": 20,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model insights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", + " input_grid={\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": [vm_xgb_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.FeaturesAUC\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"kernel_explainer_samples\": 10,\n", + " \"tree_or_linear_explainer_samples\": 200,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Diagnostic monitoring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"cut_off_threshold\": 0.04\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Robustness monitoring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"scaling_factor_std_dev_list\": [\n", + " 0.1,\n", + " 0.2,\n", + " 0.3,\n", + " 0.4,\n", + " 0.5\n", + " ],\n", + " \"performance_decay_threshold\": 0.05\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": 
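Band-level shifts of this kind are commonly summarized with the Population Stability Index (PSI), the same measure used for feature drift above. A rough sketch of the calculation, reusing the illustrative band edges from this notebook:

```python
import numpy as np
import pandas as pd


def score_band_psi(ref_scores, mon_scores, bands=(500, 540, 570)):
    """Population Stability Index between two score distributions over fixed bands."""
    edges = [-np.inf, *bands, np.inf]
    ref_pct = pd.Series(pd.cut(ref_scores, edges)).value_counts(normalize=True)
    mon_pct = pd.Series(pd.cut(mon_scores, edges)).value_counts(normalize=True)
    # Guard against empty bands before taking the log ratio
    ref_pct, mon_pct = ref_pct.clip(lower=1e-6), mon_pct.clip(lower=1e-6)
    return float(((mon_pct - ref_pct) * np.log(mon_pct / ref_pct)).sum())


# e.g. score_band_psi(train_xgb_scores, test_xgb_scores)
```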
"markdown", + "metadata": {}, + "source": [ + "### Performance history\n", + "\n", + "In this section we showcase how to track and visualize the temporal evolution of key model performance metrics, including AUC, F1 score, precision, recall, and accuracy. For demonstration purposes, the section simulates historical performance data by introducing a gradual downward trend and random noise to these metrics over a specified time period. These tests are useful for analyzing the stability and trends in model performance indicators, helping to identify potential degradation or unexpected fluctuations in model behavior over time. \n", + "\n", + "The main goal is to maintain a continuous record of model performance that can be used to detect gradual drift, sudden changes, or cyclical patterns in model effectiveness. This temporal monitoring approach provides early warning signals of potential issues and helps establish whether the model maintains consistent performance within acceptable boundaries throughout its deployment period." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = [metric for metric in list_metrics() if \"classification\" in metric]\n", + "\n", + "for metric_id in metrics:\n", + " describe_metric(metric_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.ROC_AUC\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "auc = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Accuracy\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "accuracy = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Recall\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "recall = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f1 = run_metric(\n", + " \"validmind.unit_metrics.classification.F1\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "f1 = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "precision = run_metric(\n", + " \"validmind.unit_metrics.classification.Precision\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "precision = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_DAYS = 10\n", + "REFERENCE_DATE = datetime(2024, 1, 1) # Fixed date: January 1st, 2024\n", + "base_date = REFERENCE_DATE - timedelta(days=NUM_DAYS)\n", + "\n", + "\n", + "# Initial values\n", + "performance_metrics = {\n", + " \"AUC Score\": auc,\n", + " \"F1 Score\": f1,\n", + " \"Precision Score\": precision,\n", + " \"Recall Score\": recall,\n", + " \"Accuracy Score\": accuracy\n", + "}\n", + "\n", + "# Trend parameters\n", + "trend_factor = 0.98 
# Slight downward trend (multiply by 0.98 each step)\n", + "noise_scale = 0.02 # Random fluctuation of ±2%\n", + "\n", + "\n", + "for i in range(NUM_DAYS):\n", + " recorded_at = base_date + timedelta(days=i)\n", + " print(f\"\\nrecorded_at: {recorded_at}\")\n", + "\n", + " # Log each metric with trend and noise\n", + " for metric_name, base_value in performance_metrics.items():\n", + " # Apply trend and add random noise\n", + " trend = base_value * (trend_factor ** i)\n", + " noise = np.random.normal(0, noise_scale * base_value)\n", + " value = max(0, min(1, trend + noise)) # Ensure value stays between 0 and 1\n", + " \n", + " log_metric(\n", + " key=metric_name,\n", + " value=value,\n", + " recorded_at=recorded_at.isoformat()\n", + " )\n", + " \n", + " print(f\"{metric_name:<15}: {value:.4f}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-eEL8LtKG-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/how_to/log_metrics_over_time.ipynb b/site/notebooks/how_to/log_metrics_over_time.ipynb new file mode 100644 index 0000000000..9cef4c5402 --- /dev/null +++ b/site/notebooks/how_to/log_metrics_over_time.ipynb @@ -0,0 +1,720 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Log metrics over time\n", + "\n", + "Learn how to track and visualize the temporal evolution of key model performance metrics with ValidMind.\n", + "\n", + "- Key model performance metrics such as AUC, F1 score, precision, recall, and accuracy, are useful for analyzing the stability and trends in model performance indicators, helping to identify potential degradation or unexpected fluctuations in model behavior over time.\n", + "- By monitoring these metrics systematically, teams can detect early warning signs of model drift and take proactive measures to maintain model reliability.\n", + "- Unit metrics in ValidMind provide a standardized way to compute and track individual performance measures, making it easy to monitor specific aspects of model behavior.\n", + "\n", + "Log metrics over time with the ValidMind Library's [`log_metric()`](https://docs.validmind.ai/validmind/validmind.html#log_metric) function and visualize them in your documentation using the *Metric Over Time* block within the ValidMind Platform. This integration enables seamless tracking of model performance, supporting custom thresholds and facilitating the automation of alerts based on logged metrics.\n", + "\n", + "
Metrics over time are most commonly associated with the continued monitoring of a model's performance once it is deployed.\n", + "
\n", + "While you are able to add Metric Over Time blocks to model documentation, we recommend first enabling ongoing monitoring for your model to maximize the potential of your performance data.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1_) \n", + " - [Before you begin](#toc1_1_) \n", + " - [New to ValidMind?](#toc1_2_) \n", + " - [Key concepts](#toc1_3_) \n", + "- [Install the ValidMind Library](#toc2_) \n", + "- [Initialize the ValidMind Library](#toc3_) \n", + " - [Get your code snippet](#toc3_1_) \n", + "- [Initialize the Python environment](#toc4_) \n", + "- [Load demo model](#toc5_) \n", + "- [Log metrics](#toc6_) \n", + " - [Run unit metrics](#toc6_1_) \n", + " - [Log unit metrics over time](#toc6_2_) \n", + " - [Pass thresholds](#toc6_3_) \n", + " - [Log multiple metrics with custom thresholds](#toc6_4_) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## About ValidMind\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "

\n", + "Signing up is FREE — Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: The [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Credit Risk Scorecard`\n", + " - Use case: `Credit Risk - CECL`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + " monitoring = True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Initialize the Python environment\n", + "\n", + "Next, let's import the necessary libraries and set up your Python environment for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "import numpy as np\n", + "\n", + "from datetime import datetime, timedelta\n", + "\n", + "from validmind.unit_metrics import list_metrics, describe_metric, run_metric\n", + "from validmind.api_client import log_metric\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Load demo model\n", + "\n", + "We'll use a classification model trained on customer churn data to demonstrate ValidMind's metric logging capabilities.\n", + "\n", + "- We'll employ a built-in classification dataset, process it through train-validation-test splits, and train an XGBoost classifier.\n", + "- The trained model and datasets are then initialized in ValidMind's framework, enabling us to track and monitor various performance metrics in the following sections." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the sample dataset from the library\n", + "\n", + "from validmind.datasets.classification import customer_churn\n", + "\n", + "print(\n", + " f\"Loaded demo dataset with: \\n\\n\\t• Target column: '{customer_churn.target_column}' \\n\\t• Class labels: {customer_churn.class_labels}\"\n", + ")\n", + "\n", + "raw_df = customer_churn.load_data()\n", + "raw_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df, validation_df, test_df = customer_churn.preprocess(raw_df)\n", + "\n", + "x_train = train_df.drop(customer_churn.target_column, axis=1)\n", + "y_train = train_df[customer_churn.target_column]\n", + "x_val = validation_df.drop(customer_churn.target_column, axis=1)\n", + "y_val = validation_df[customer_churn.target_column]\n", + "\n", + "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", + "model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "model.fit(\n", + " x_train,\n", + " y_train,\n", + " eval_set=[(x_val, y_val)],\n", + " verbose=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the datasets and model are prepared for validation, let's initialize the ValidMind `dataset` and `model`, specifying features and targets columns.\n", + "\n", + "- The property `input_id` allows users to uniquely identify each dataset and model.\n", + "- This allows for the creation of multiple versions of datasets and models, enabling us to compute metrics by specifying which versions we want to use as inputs." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=raw_df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=customer_churn.target_column,\n", + " class_labels=customer_churn.class_labels,\n", + ")\n", + "\n", + "vm_train_ds = vm.init_dataset(\n", + " dataset=train_df,\n", + " input_id=\"train_dataset\",\n", + " target_column=customer_churn.target_column,\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " dataset=test_df, input_id=\"test_dataset\", target_column=customer_churn.target_column\n", + ")\n", + "\n", + "vm_model = vm.init_model(\n", + " model,\n", + " input_id=\"model\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now use the `assign_predictions()` method from the Dataset object to link existing predictions to any model. \n", + "\n", + "If no prediction values are passed, the method will compute predictions automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds.assign_predictions(\n", + " model=vm_model,\n", + ")\n", + "\n", + "vm_test_ds.assign_predictions(\n", + " model=vm_model,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Log metrics\n", + "\n", + "Next, we'll use ValidMind to track the temporal evolution of key model performance metrics.\n", + "\n", + "We'll set appropriate thresholds for each metric, enable automated alerting when performance drifts beyond acceptable boundaries, and demonstrate how these thresholds can be customized based on business requirements and risk tolerance levels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = [metric for metric in list_metrics() if \"classification\" in metric]\n", + "\n", + "for metric_id in metrics:\n", + " describe_metric(metric_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run unit metrics\n", + "\n", + "Compute individual metrics using ValidMind's *unit metrics* — single-value metrics that can be computed on a dataset and model. Use the `run_metric()` function from the `validmind.unit_metrics` module to calculate these metrics.\n", + "\n", + "The `run_metric()` function has a signature similar to `run_test()` from the `validmind.tests` module, but is specifically designed for unit metrics and takes the following arguments:\n", + "\n", + "- **`metric_id`:** The unique identifier for the metric (for example, `validmind.unit_metrics.classification.ROC_AUC`)\n", + "- **`inputs`:** A dictionary containing the input dataset and model or their respective input IDs\n", + "- **`params`:** A dictionary containing keyword arguments for the unit metric (optional, accepts any `kwargs` from the underlying sklearn implementation)\n", + "\n", + "`run_metric()` returns and displays a result object similar to a regular ValidMind test, but only shows the unit metric value. While this result object has a `.log()` method for logging to the ValidMind Platform, in this use case we'll use unit metrics to compute performance metrics and then log them over time using the `log_metric()` function from the `validmind.api_client` module." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.ROC_AUC\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "auc = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Accuracy\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "accuracy = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Recall\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "recall = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.F1\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "f1 = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Precision\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "precision = result.metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Log unit metrics over time\n", + "\n", + "Using the `log_metric()` function from the `validmind.api_client` module, let's log the unit metrics over time. 
This function takes the following arguments:\n", + "\n", + "- **`key`:** The name of the metric to log\n", + "- **`value`:** The value of the metric to log\n", + "- **`recorded_at`:** The timestamp of the metric to log — useful for logging historic predictions\n", + "- **`thresholds`:** A dictionary containing the thresholds for the metric to log\n", + "- **`params`:** A dictionary of additional parameters to record alongside the metric (optional)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "log_metric(\n", + " key=\"AUC Score\",\n", + " value=auc,\n", + " # If `recorded_at` is not provided, the current time is used\n", + " recorded_at=datetime(2024, 1, 1), \n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To visualize the logged metric, we'll use the **[Metrics Over Time block](https://docs.validmind.ai/guide/monitoring/work-with-metrics-over-time.html)** in the ValidMind Platform:\n", + "\n", + "- After adding this visualization block to your documentation or ongoing monitoring report (as shown in the image below), you'll be able to review your logged metrics plotted over time.\n", + "- In this example, since we've only logged a single data point, the visualization shows just one measurement.\n", + "- As you continue logging metrics, the graph will populate with more points, enabling you to track trends and patterns.\n", + "\n", + "![Metric Over Time block](../images/add_metric_over_time_block.png)\n", + "![AUC Score](../images/log_metric_auc_1.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Pass thresholds\n", + "\n", + "We can pass *thresholds* to the `log_metric()` function to add context to the metric as it is tracked over time: \n", + "\n", + "- Thresholds make it easier to interpret the metric over time and to spot potential issues. \n", + "- The metric visualization component provides a dynamic way to monitor and contextualize metric values through customizable thresholds. \n", + "- These thresholds appear as horizontal reference lines on the chart. \n", + "- The system always displays the most recent threshold configuration, meaning that if you update the threshold values in your logging code, the visualization will reflect these changes immediately. \n", + "\n", + "When a metric is logged without thresholds or with an empty threshold dictionary, the reference lines disappear from the chart, though the metric line itself remains visible. \n", + "\n", + "Thresholds are highly flexible: you can define them with any meaningful key names (such as `low_risk`, `maximum`, `target`, or `acceptable_range`) in your metric data, and the visualization will adapt accordingly. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "log_metric(\n", + " key=\"AUC Score\",\n", + " value=auc,\n", + " recorded_at=datetime(2024, 1, 1),\n", + " thresholds={\n", + " \"min_auc\": 0.7,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![AUC Score](../images/log_metric_auc_2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "log_metric(\n", + " key=\"AUC Score\",\n", + " value=auc,\n", + " recorded_at=datetime(2024, 1, 1),\n", + " thresholds={\n", + " \"high_risk\": 0.6,\n", + " \"medium_risk\": 0.7,\n", + " \"low_risk\": 0.8,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![AUC Score](../images/log_metric_auc_3.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Log multiple metrics with custom thresholds\n", + "\n", + "The following code snippet shows an example of how to set up and log multiple performance metrics with custom thresholds for each metric:\n", + "\n", + "- Using AUC, F1, Precision, Recall, and Accuracy scores as examples, it demonstrates how to define different risk levels (high, medium, low) appropriate for each metric's expected range.\n", + "- The code simulates 10 days of metric history by applying a gradual decay and random noise to help visualize how metrics might drift over time in a production environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_DAYS = 10\n", + "REFERENCE_DATE = datetime(2024, 1, 1) # Fixed date: January 1st, 2024\n", + "base_date = REFERENCE_DATE - timedelta(days=NUM_DAYS)\n", + "\n", + "# Initial values with their specific thresholds\n", + "performance_metrics = {\n", + " \"AUC Score\": {\n", + " \"value\": auc,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.7,\n", + " \"medium_risk\": 0.8,\n", + " \"low_risk\": 0.9,\n", + " }\n", + " },\n", + " \"F1 Score\": {\n", + " \"value\": f1,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.5,\n", + " \"medium_risk\": 0.6,\n", + " \"low_risk\": 0.7,\n", + " }\n", + " },\n", + " \"Precision Score\": {\n", + " \"value\": precision,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.6,\n", + " \"medium_risk\": 0.7,\n", + " \"low_risk\": 0.8,\n", + " }\n", + " },\n", + " \"Recall Score\": {\n", + " \"value\": recall,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.4,\n", + " \"medium_risk\": 0.5,\n", + " \"low_risk\": 0.6,\n", + " }\n", + " },\n", + " \"Accuracy Score\": {\n", + " \"value\": accuracy,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.75,\n", + " \"medium_risk\": 0.8,\n", + " \"low_risk\": 0.85,\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Trend parameters\n", + "trend_factor = 0.98 # Slight downward trend\n", + "noise_scale = 0.02 # Random fluctuation of ±2%\n", + "\n", + "for i in range(NUM_DAYS):\n", + " recorded_at = base_date + timedelta(days=i)\n", + " print(f\"\\nrecorded_at: {recorded_at}\")\n", + "\n", + " # Log each metric with trend and noise\n", + " for metric_name, metric_info in performance_metrics.items():\n", + " base_value = metric_info[\"value\"]\n", + " thresholds = metric_info[\"thresholds\"]\n", + " \n", + " # Apply trend and add random noise\n", + " trend = base_value * (trend_factor ** i)\n", + " noise = np.random.normal(0, noise_scale * base_value)\n", + " value = max(0, min(1, trend + noise)) # 
Ensure value stays between 0 and 1\n", + " \n", + " log_metric(\n", + " key=metric_name,\n", + " value=value,\n", + " recorded_at=recorded_at.isoformat(),\n", + " thresholds=thresholds\n", + " )\n", + " \n", + " print(f\"{metric_name:<15}: {value:.4f} (Thresholds: {thresholds})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![AUC Score](../images/log_metric_auc_4.png)\n", + "![Accuracy Score](../images/log_metric_accuracy.png)\n", + "![Precision Score](../images/log_metric_precision.png)\n", + "![Recall Score](../images/log_metric_recall.png)\n", + "![F1 Score](../images/log_metric_f1.png)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/images/add_metric_over_time_block.png b/site/notebooks/images/add_metric_over_time_block.png new file mode 100644 index 0000000000..5ddaa84faa Binary files /dev/null and b/site/notebooks/images/add_metric_over_time_block.png differ diff --git a/site/notebooks/images/log_metric_accuracy.png b/site/notebooks/images/log_metric_accuracy.png new file mode 100644 index 0000000000..6d47a55c89 Binary files /dev/null and b/site/notebooks/images/log_metric_accuracy.png differ diff --git a/site/notebooks/images/log_metric_auc_1.png b/site/notebooks/images/log_metric_auc_1.png new file mode 100644 index 0000000000..767da49a51 Binary files /dev/null and b/site/notebooks/images/log_metric_auc_1.png differ diff --git a/site/notebooks/images/log_metric_auc_2.png b/site/notebooks/images/log_metric_auc_2.png new file mode 100644 index 0000000000..8b79b09451 Binary files /dev/null and b/site/notebooks/images/log_metric_auc_2.png differ diff --git a/site/notebooks/images/log_metric_auc_3.png b/site/notebooks/images/log_metric_auc_3.png new file mode 100644 index 0000000000..84fa26ffc7 Binary files /dev/null and b/site/notebooks/images/log_metric_auc_3.png differ diff --git a/site/notebooks/images/log_metric_auc_4.png b/site/notebooks/images/log_metric_auc_4.png new file mode 100644 index 0000000000..aa1fa53265 Binary files /dev/null and b/site/notebooks/images/log_metric_auc_4.png differ diff --git a/site/notebooks/images/log_metric_f1.png b/site/notebooks/images/log_metric_f1.png new file mode 100644 index 0000000000..9e08241395 Binary files /dev/null and b/site/notebooks/images/log_metric_f1.png differ diff --git a/site/notebooks/images/log_metric_precision.png b/site/notebooks/images/log_metric_precision.png new file mode 100644 index 0000000000..946484c03b Binary files /dev/null and b/site/notebooks/images/log_metric_precision.png differ diff --git a/site/notebooks/images/log_metric_recall.png b/site/notebooks/images/log_metric_recall.png new file mode 100644 index 0000000000..1cd24a95ea Binary files /dev/null and b/site/notebooks/images/log_metric_recall.png differ diff --git a/site/python-docs.zip b/site/python-docs.zip index 54900bc708..2c32f416d5 100644 Binary files a/site/python-docs.zip and b/site/python-docs.zip differ diff --git a/site/tests/ongoing_monitoring/CalibrationCurveDrift.md b/site/tests/ongoing_monitoring/CalibrationCurveDrift.md new file mode 100644 index 0000000000..3fa615e8df --- /dev/null +++ 
b/site/tests/ongoing_monitoring/CalibrationCurveDrift.md @@ -0,0 +1,46 @@ +# CalibrationCurveDrift + +Evaluates changes in probability calibration between reference and monitoring datasets. + +### Purpose + +The Calibration Curve Drift test is designed to assess changes in the model's probability calibration +over time. By comparing calibration curves between reference and monitoring datasets, this test helps +identify whether the model's probability estimates remain reliable in production. This is crucial for +understanding if the model's risk predictions maintain their intended interpretation and whether +recalibration might be necessary. + +### Test Mechanism + +This test proceeds by generating calibration curves for both reference and monitoring datasets. For each +dataset, it bins the predicted probabilities and calculates the actual fraction of positives within each +bin. It then compares these values between datasets to identify significant shifts in calibration. +The test quantifies drift as percentage changes in both mean predicted probabilities and actual fractions +of positives per bin, providing both visual and numerical assessments of calibration stability. + +### Signs of High Risk + +- Large differences between reference and monitoring calibration curves +- Systematic over-estimation or under-estimation in monitoring dataset +- Significant drift percentages exceeding the threshold in multiple bins +- Changes in calibration concentrated in specific probability ranges +- Inconsistent drift patterns across the probability spectrum +- Empty or sparse bins indicating insufficient data for reliable comparison + +### Strengths + +- Provides visual and quantitative assessment of calibration changes +- Identifies specific probability ranges where calibration has shifted +- Enables early detection of systematic prediction biases +- Includes detailed bin-by-bin comparison of calibration metrics +- Handles edge cases with insufficient data in certain bins +- Supports both binary and probabilistic interpretation of results + +### Limitations + +- Requires sufficient data in each probability bin for reliable comparison +- Sensitive to choice of number of bins and binning strategy +- May not capture complex changes in probability distributions +- Cannot directly suggest recalibration parameters +- Limited to assessing probability calibration aspects +- Results may be affected by class imbalance changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ClassDiscriminationDrift.md b/site/tests/ongoing_monitoring/ClassDiscriminationDrift.md new file mode 100644 index 0000000000..6a81a93961 --- /dev/null +++ b/site/tests/ongoing_monitoring/ClassDiscriminationDrift.md @@ -0,0 +1,46 @@ +# ClassDiscriminationDrift + +Compares classification discrimination metrics between reference and monitoring datasets. + +### Purpose + +The Class Discrimination Drift test is designed to evaluate changes in the model's discriminative power +over time. By comparing key discrimination metrics between reference and monitoring datasets, this test +helps identify whether the model maintains its ability to separate classes in production. This is crucial +for understanding if the model's predictive power remains stable and whether its decision boundaries +continue to effectively distinguish between different classes. 
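For intuition only, the kind of comparison this test performs can be sketched with standard scikit-learn and SciPy calls. This is not ValidMind's implementation; the helper name `discrimination_drift`, the binary-classification assumption, and the 10% drift threshold are choices made for the example, and the metrics themselves are described in the Test Mechanism section that follows.

```python
import numpy as np
from scipy import stats
from sklearn.metrics import roc_auc_score


def discrimination_drift(y_ref, proba_ref, y_mon, proba_mon, drift_pct_threshold=10.0):
    """Compare ROC AUC, GINI, and KS between reference and monitoring data (binary case)."""

    def summarize(y, proba):
        y, proba = np.asarray(y), np.asarray(proba)
        auc = roc_auc_score(y, proba)
        # KS statistic: maximum separation between score distributions of the two classes
        ks = stats.ks_2samp(proba[y == 1], proba[y == 0]).statistic
        return {"roc_auc": auc, "gini": 2 * auc - 1, "ks": ks}

    ref, mon = summarize(y_ref, proba_ref), summarize(y_mon, proba_mon)
    report = {}
    for name in ref:
        drift_pct = abs(mon[name] - ref[name]) / max(abs(ref[name]), 1e-12) * 100
        report[name] = {
            "reference": round(ref[name], 4),
            "monitoring": round(mon[name], 4),
            "drift_%": round(drift_pct, 2),
            "exceeds_threshold": drift_pct > drift_pct_threshold,
        }
    return report
```

Here the GINI coefficient is derived from the AUC as `2 * AUC - 1`, so the two always move together; the KS statistic adds a distribution-level view of class separation.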
+ +### Test Mechanism + +This test proceeds by calculating three key discrimination metrics for both reference and monitoring +datasets: ROC AUC (Area Under the Curve), GINI coefficient, and KS (Kolmogorov-Smirnov) statistic. +For binary classification, it computes all three metrics. For multiclass problems, it focuses on +macro-averaged ROC AUC. The test quantifies drift as percentage changes in these metrics between +datasets, providing a comprehensive assessment of discrimination stability. + +### Signs of High Risk + +- Large drifts in discrimination metrics exceeding the threshold +- Significant drops in ROC AUC indicating reduced ranking ability +- Decreased GINI coefficients showing diminished separation power +- Reduced KS statistics suggesting weaker class distinction +- Inconsistent changes across different metrics +- Systematic degradation in discriminative performance + +### Strengths + +- Combines multiple complementary discrimination metrics +- Handles both binary and multiclass classification +- Provides clear quantitative drift assessment +- Enables early detection of model degradation +- Includes standardized drift threshold evaluation +- Supports comprehensive performance monitoring + +### Limitations + +- Does not identify root causes of discrimination drift +- May be sensitive to changes in class distribution +- Cannot suggest optimal decision threshold adjustments +- Limited to discrimination aspects of performance +- Requires sufficient data for reliable metric calculation +- May not capture subtle changes in decision boundaries \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ClassImbalanceDrift.md b/site/tests/ongoing_monitoring/ClassImbalanceDrift.md new file mode 100644 index 0000000000..9523239d03 --- /dev/null +++ b/site/tests/ongoing_monitoring/ClassImbalanceDrift.md @@ -0,0 +1,46 @@ +# ClassImbalanceDrift + +Evaluates drift in class distribution between reference and monitoring datasets. + +### Purpose + +The Class Imbalance Drift test is designed to detect changes in the distribution of target classes +over time. By comparing class proportions between reference and monitoring datasets, this test helps +identify whether the population structure remains stable in production. This is crucial for +understanding if the model continues to operate under similar class distribution assumptions and +whether retraining might be necessary due to significant shifts in class balance. + +### Test Mechanism + +This test proceeds by calculating class percentages for both reference and monitoring datasets. +It computes the proportion of each class and quantifies drift as the percentage difference in these +proportions between datasets. The test provides both visual and numerical comparisons of class +distributions, with special attention to changes that exceed the specified drift threshold. +Population stability is assessed on a class-by-class basis. 
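A minimal pandas sketch of this kind of class-proportion comparison is shown below. It is illustrative only, not the test's actual code; the helper name `class_imbalance_drift` and the 20% drift threshold are assumptions for the example.

```python
import numpy as np
import pandas as pd


def class_imbalance_drift(y_ref, y_mon, drift_pct_threshold=20.0):
    """Compare class percentages between a reference and a monitoring target column."""
    ref_pct = pd.Series(y_ref).value_counts(normalize=True) * 100
    mon_pct = pd.Series(y_mon).value_counts(normalize=True) * 100

    # Align on the union of classes so new or vanished classes show up explicitly
    table = pd.DataFrame({"reference_%": ref_pct, "monitoring_%": mon_pct}).fillna(0.0)

    # Relative drift per class, expressed as a percentage of the reference proportion
    table["drift_%"] = (
        (table["monitoring_%"] - table["reference_%"]).abs()
        / table["reference_%"].replace(0, np.nan)
        * 100
    )
    table["exceeds_threshold"] = table["drift_%"] > drift_pct_threshold
    return table


# Example: the positive class shrinks from 20% to 15% of the population
print(class_imbalance_drift([0] * 800 + [1] * 200, [0] * 850 + [1] * 150))
```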
+ +### Signs of High Risk + +- Large shifts in class proportions exceeding the threshold +- Systematic changes affecting multiple classes +- Appearance of new classes or disappearance of existing ones +- Significant changes in minority class representation +- Reversal of majority-minority class relationships +- Unexpected changes in class ratios + +### Strengths + +- Provides clear visualization of distribution changes +- Identifies specific classes experiencing drift +- Enables early detection of population shifts +- Includes standardized drift threshold evaluation +- Supports both binary and multiclass problems +- Maintains interpretable percentage-based metrics + +### Limitations + +- Does not account for feature distribution changes +- Cannot identify root causes of class drift +- May be sensitive to small sample sizes +- Limited to target variable distribution only +- Requires sufficient samples per class +- May not capture subtle distribution changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ClassificationAccuracyDrift.md b/site/tests/ongoing_monitoring/ClassificationAccuracyDrift.md new file mode 100644 index 0000000000..a416c5d178 --- /dev/null +++ b/site/tests/ongoing_monitoring/ClassificationAccuracyDrift.md @@ -0,0 +1,46 @@ +# ClassificationAccuracyDrift + +Compares classification accuracy metrics between reference and monitoring datasets. + +### Purpose + +The Classification Accuracy Drift test is designed to evaluate changes in the model's predictive accuracy +over time. By comparing key accuracy metrics between reference and monitoring datasets, this test helps +identify whether the model maintains its performance levels in production. This is crucial for +understanding if the model's predictions remain reliable and whether its overall effectiveness has +degraded significantly. + +### Test Mechanism + +This test proceeds by calculating comprehensive accuracy metrics for both reference and monitoring +datasets. It computes overall accuracy, per-label precision, recall, and F1 scores, as well as +macro-averaged metrics. The test quantifies drift as percentage changes in these metrics between +datasets, providing both granular and aggregate views of accuracy changes. Special attention is paid +to per-label performance to identify class-specific degradation. 
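The aggregate part of this comparison can be sketched with scikit-learn as follows. The sketch covers accuracy and macro-averaged precision, recall, and F1 only (the per-label breakdown is omitted), and the helper name and 10% drift threshold are assumptions for illustration rather than the test's actual code.

```python
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def accuracy_drift(y_true_ref, y_pred_ref, y_true_mon, y_pred_mon, drift_pct_threshold=10.0):
    """Compare accuracy and macro precision/recall/F1 between reference and monitoring data."""

    def summarize(y_true, y_pred):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="macro", zero_division=0
        )
        return {"accuracy": accuracy_score(y_true, y_pred),
                "precision_macro": precision, "recall_macro": recall, "f1_macro": f1}

    ref, mon = summarize(y_true_ref, y_pred_ref), summarize(y_true_mon, y_pred_mon)
    rows = []
    for name in ref:
        # Relative change versus the reference value, in percent
        drift_pct = abs(mon[name] - ref[name]) / ref[name] * 100 if ref[name] else float("nan")
        rows.append({
            "metric": name,
            "reference": round(ref[name], 4),
            "monitoring": round(mon[name], 4),
            "drift_%": round(drift_pct, 2),
            "exceeds_threshold": drift_pct > drift_pct_threshold,
        })
    return rows
```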
+ +### Signs of High Risk + +- Large drifts in accuracy metrics exceeding the threshold +- Inconsistent changes across different labels +- Significant drops in macro-averaged metrics +- Systematic degradation in specific class performance +- Unexpected improvements suggesting data quality issues +- Divergent trends between precision and recall + +### Strengths + +- Provides comprehensive accuracy assessment +- Identifies class-specific performance changes +- Enables early detection of model degradation +- Includes both micro and macro perspectives +- Supports multi-class classification evaluation +- Maintains interpretable drift thresholds + +### Limitations + +- May be sensitive to class distribution changes +- Does not account for prediction confidence +- Cannot identify root causes of accuracy drift +- Limited to accuracy-based metrics only +- Requires sufficient samples per class +- May not capture subtle performance changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ConfusionMatrixDrift.md b/site/tests/ongoing_monitoring/ConfusionMatrixDrift.md new file mode 100644 index 0000000000..cc38f3e5d1 --- /dev/null +++ b/site/tests/ongoing_monitoring/ConfusionMatrixDrift.md @@ -0,0 +1,46 @@ +# ConfusionMatrixDrift + +Compares confusion matrix metrics between reference and monitoring datasets. + +### Purpose + +The Confusion Matrix Drift test is designed to evaluate changes in the model's error patterns +over time. By comparing confusion matrix elements between reference and monitoring datasets, this +test helps identify whether the model maintains consistent prediction behavior in production. This +is crucial for understanding if the model's error patterns have shifted and whether specific types +of misclassifications have become more prevalent. + +### Test Mechanism + +This test proceeds by generating confusion matrices for both reference and monitoring datasets. +For binary classification, it tracks True Positives, True Negatives, False Positives, and False +Negatives as percentages of total predictions. For multiclass problems, it analyzes per-class +metrics including true positives and error rates. The test quantifies drift as percentage changes +in these metrics between datasets, providing detailed insight into shifting prediction patterns. 
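For the binary case, the core comparison can be sketched as below. This is an illustration under assumed 0/1 labels, not the test's implementation; reporting drift in percentage points rather than relative percentages is also a choice made for the example.

```python
from sklearn.metrics import confusion_matrix


def confusion_matrix_drift(y_true_ref, y_pred_ref, y_true_mon, y_pred_mon):
    """Compare binary confusion-matrix cells, each expressed as a % of all predictions."""

    def cell_percentages(y_true, y_pred):
        # For labels [0, 1], ravel() yields TN, FP, FN, TP in that order
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        total = tn + fp + fn + tp
        return {"TN_%": 100 * tn / total, "FP_%": 100 * fp / total,
                "FN_%": 100 * fn / total, "TP_%": 100 * tp / total}

    ref = cell_percentages(y_true_ref, y_pred_ref)
    mon = cell_percentages(y_true_mon, y_pred_mon)
    return {
        cell: {
            "reference_%": round(ref[cell], 2),
            "monitoring_%": round(mon[cell], 2),
            "drift_pp": round(mon[cell] - ref[cell], 2),  # change in percentage points
        }
        for cell in ref
    }
```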
+ +### Signs of High Risk + +- Large drifts in confusion matrix elements exceeding threshold +- Systematic changes in false positive or false negative rates +- Inconsistent changes across different classes +- Significant shifts in error patterns for specific classes +- Unexpected improvements in certain metrics +- Divergent trends between different types of errors + +### Strengths + +- Provides detailed analysis of prediction behavior +- Identifies specific types of prediction changes +- Enables early detection of systematic errors +- Includes comprehensive error pattern analysis +- Supports both binary and multiclass problems +- Maintains interpretable percentage-based metrics + +### Limitations + +- May be sensitive to class distribution changes +- Cannot identify root causes of prediction drift +- Requires sufficient samples for reliable comparison +- Limited to hard predictions (not probabilities) +- May not capture subtle changes in decision boundaries +- Complex interpretation for multiclass problems \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.md b/site/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.md new file mode 100644 index 0000000000..415bb204cf --- /dev/null +++ b/site/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.md @@ -0,0 +1,47 @@ +# CumulativePredictionProbabilitiesDrift + +Compares cumulative prediction probability distributions between reference and monitoring datasets. + +### Purpose + +The Cumulative Prediction Probabilities Drift test is designed to evaluate changes in the model's +probability predictions over time. By comparing cumulative distribution functions of predicted +probabilities between reference and monitoring datasets, this test helps identify whether the +model's probability assignments remain stable in production. This is crucial for understanding if +the model's risk assessment behavior has shifted and whether its probability calibration remains +consistent. + +### Test Mechanism + +This test proceeds by generating cumulative distribution functions (CDFs) of predicted probabilities +for both reference and monitoring datasets. For each class, it plots the cumulative proportion of +predictions against probability values, enabling direct comparison of probability distributions. +The test visualizes both the CDFs and their differences, providing insight into how probability +assignments have shifted across the entire probability range. 
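A rough matplotlib sketch of this comparison is shown below, assuming two arrays of positive-class probabilities. The evaluation grid of 101 points and the side-by-side layout are illustrative choices, not ValidMind's plotting code.

```python
import matplotlib.pyplot as plt
import numpy as np


def plot_probability_cdf_drift(proba_ref, proba_mon):
    """Overlay empirical CDFs of predicted probabilities and plot their difference."""
    proba_ref, proba_mon = np.asarray(proba_ref), np.asarray(proba_mon)
    grid = np.linspace(0, 1, 101)  # common probability grid for both CDFs
    cdf_ref = np.array([np.mean(proba_ref <= p) for p in grid])
    cdf_mon = np.array([np.mean(proba_mon <= p) for p in grid])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(grid, cdf_ref, label="reference")
    ax1.plot(grid, cdf_mon, label="monitoring")
    ax1.set(xlabel="Predicted probability", ylabel="Cumulative proportion", title="Empirical CDFs")
    ax1.legend()

    ax2.plot(grid, cdf_mon - cdf_ref)
    ax2.axhline(0, color="grey", linewidth=0.8)
    ax2.set(xlabel="Predicted probability", ylabel="CDF difference", title="Monitoring minus reference")
    fig.tight_layout()
    return fig


# Example with synthetic probability scores
rng = np.random.default_rng(0)
fig = plot_probability_cdf_drift(rng.beta(2, 5, 1000), rng.beta(2, 4, 1000))
```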
+ +### Signs of High Risk + +- Large gaps between reference and monitoring CDFs +- Systematic shifts in probability assignments +- Concentration of differences in specific probability ranges +- Changes in the shape of probability distributions +- Unexpected patterns in cumulative differences +- Significant shifts in probability thresholds + +### Strengths + +- Provides comprehensive view of probability changes +- Identifies specific probability ranges with drift +- Enables visualization of distribution differences +- Supports analysis across multiple classes +- Maintains interpretable probability scale +- Captures subtle changes in probability assignments + +### Limitations + +- Does not provide single drift metric +- May be complex to interpret for multiple classes +- Cannot suggest probability recalibration +- Requires visual inspection for assessment +- Sensitive to sample size differences +- May not capture class-specific calibration issues \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.md b/site/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.md new file mode 100644 index 0000000000..21c585141b --- /dev/null +++ b/site/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.md @@ -0,0 +1,47 @@ +# PredictionProbabilitiesHistogramDrift + +Compares prediction probability distributions between reference and monitoring datasets. + +### Purpose + +The Prediction Probabilities Histogram Drift test is designed to evaluate changes in the model's +probability predictions over time. By comparing probability distributions between reference and +monitoring datasets using histograms, this test helps identify whether the model's probability +assignments have shifted in production. This is crucial for understanding if the model's risk +assessment behavior remains consistent and whether its probability estimates maintain their +original distribution patterns. + +### Test Mechanism + +This test proceeds by generating histograms of prediction probabilities for both reference and +monitoring datasets. For each class, it analyzes the distribution shape, central tendency, and +spread of probabilities. The test computes distribution moments (mean, variance, skewness, +kurtosis) and quantifies their drift between datasets. Visual comparison of overlaid histograms +provides immediate insight into distribution changes. 
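The moment comparison can be sketched with NumPy and SciPy as follows; the helper name, the 20% drift threshold, and the decision to skip drift percentages for near-zero reference moments are assumptions made for the example.

```python
import numpy as np
from scipy import stats


def probability_moment_drift(proba_ref, proba_mon, drift_pct_threshold=20.0):
    """Compare mean, variance, skewness, and kurtosis of two probability distributions."""

    def moments(p):
        p = np.asarray(p, dtype=float)
        return {"mean": p.mean(), "variance": p.var(),
                "skewness": stats.skew(p), "kurtosis": stats.kurtosis(p)}

    ref, mon = moments(proba_ref), moments(proba_mon)
    report = {}
    for name in ref:
        denom = abs(ref[name])
        # Avoid dividing by (near-)zero reference moments
        drift_pct = abs(mon[name] - ref[name]) / denom * 100 if denom > 1e-12 else float("nan")
        report[name] = {
            "reference": round(float(ref[name]), 4),
            "monitoring": round(float(mon[name]), 4),
            "drift_%": round(drift_pct, 2),
            "exceeds_threshold": drift_pct > drift_pct_threshold,
        }
    return report
```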
+ +### Signs of High Risk + +- Significant shifts in probability distribution shapes +- Large drifts in distribution moments exceeding threshold +- Appearance of new modes or peaks in monitoring data +- Changes in the spread or concentration of probabilities +- Systematic shifts in probability assignments +- Unexpected changes in distribution characteristics + +### Strengths + +- Provides intuitive visualization of probability changes +- Identifies specific changes in distribution shape +- Enables quantitative assessment of distribution drift +- Supports analysis across multiple classes +- Includes comprehensive moment analysis +- Maintains interpretable probability scale + +### Limitations + +- May be sensitive to binning choices +- Requires sufficient samples for reliable histograms +- Cannot suggest probability recalibration +- Complex interpretation for multiple classes +- May not capture subtle distribution changes +- Limited to univariate probability analysis \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.md b/site/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.md new file mode 100644 index 0000000000..36bd5ff060 --- /dev/null +++ b/site/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.md @@ -0,0 +1,34 @@ +# PredictionQuantilesAcrossFeatures + +Assesses differences in model prediction distributions across individual features between reference +and monitoring datasets through quantile analysis. + +### Purpose + +This test aims to visualize how prediction distributions vary across feature values by showing +quantile information between reference and monitoring datasets. It helps identify significant +shifts in prediction patterns and potential areas of model instability. + +### Test Mechanism + +The test generates box plots for each feature, comparing prediction probability distributions +between the reference and monitoring datasets. Each plot consists of two subplots showing the +quantile distribution of predictions: one for reference data and one for monitoring data. + +### Signs of High Risk + +- Significant differences in prediction distributions between reference and monitoring data +- Unexpected shifts in prediction quantiles across feature values +- Large changes in prediction variability between datasets + +### Strengths + +- Provides clear visualization of prediction distribution changes +- Shows outliers and variability in predictions across features +- Enables quick identification of problematic feature ranges + +### Limitations + +- May not capture complex relationships between features and predictions +- Quantile analysis may smooth over important individual predictions +- Requires careful interpretation of distribution changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ROCCurveDrift.md b/site/tests/ongoing_monitoring/ROCCurveDrift.md new file mode 100644 index 0000000000..8c556f72b6 --- /dev/null +++ b/site/tests/ongoing_monitoring/ROCCurveDrift.md @@ -0,0 +1,47 @@ +# ROCCurveDrift + +Compares ROC curves between reference and monitoring datasets. + +### Purpose + +The ROC Curve Drift test is designed to evaluate changes in the model's discriminative ability +over time. By comparing Receiver Operating Characteristic (ROC) curves between reference and +monitoring datasets, this test helps identify whether the model maintains its ability to +distinguish between classes across different decision thresholds. 
This is crucial for +understanding if the model's trade-off between sensitivity and specificity remains stable +in production. + +### Test Mechanism + +This test proceeds by generating ROC curves for both reference and monitoring datasets. For each +dataset, it plots the True Positive Rate against the False Positive Rate across all possible +classification thresholds. The test also computes AUC scores and visualizes the difference +between ROC curves, providing both graphical and numerical assessments of discrimination +stability. Special attention is paid to regions where curves diverge significantly. + +### Signs of High Risk + +- Large differences between reference and monitoring ROC curves +- Significant drop in AUC score for monitoring dataset +- Systematic differences in specific FPR regions +- Changes in optimal operating points +- Inconsistent performance across different thresholds +- Unexpected crossovers between curves + +### Strengths + +- Provides comprehensive view of discriminative ability +- Identifies specific threshold ranges with drift +- Enables visualization of performance differences +- Includes AUC comparison for overall assessment +- Supports threshold-independent evaluation +- Maintains interpretable performance metrics + +### Limitations + +- Limited to binary classification problems +- May be sensitive to class distribution changes +- Cannot suggest optimal threshold adjustments +- Requires visual inspection for detailed analysis +- Complex interpretation of curve differences +- May not capture subtle performance changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ScoreBandsDrift.md b/site/tests/ongoing_monitoring/ScoreBandsDrift.md new file mode 100644 index 0000000000..3f25439f8d --- /dev/null +++ b/site/tests/ongoing_monitoring/ScoreBandsDrift.md @@ -0,0 +1,48 @@ +# ScoreBandsDrift + +Analyzes drift in population distribution and default rates across score bands. + +### Purpose + +The Score Bands Drift test is designed to evaluate changes in score-based risk segmentation +over time. By comparing population distribution and default rates across score bands between +reference and monitoring datasets, this test helps identify whether the model's risk +stratification remains stable in production. This is crucial for understanding if the model's +scoring behavior maintains its intended risk separation and whether specific score ranges +have experienced significant shifts. + +### Test Mechanism + +This test proceeds by segmenting scores into predefined bands and analyzing three key metrics +across these bands: population distribution, predicted default rates, and observed default +rates. For each band, it computes these metrics for both reference and monitoring datasets +and quantifies drift as percentage changes. The test provides both detailed band-by-band +comparisons and overall stability assessment, with special attention to bands showing +significant drift. 
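A pandas sketch of the band-level summary is shown below. The column names (`score`, `default_prob`, `observed_default`) and the band edges are placeholders; the actual test works with whichever score bands and columns are configured for the model.

```python
import numpy as np
import pandas as pd


def score_bands_drift(ref: pd.DataFrame, mon: pd.DataFrame, band_edges=(500, 540, 580)):
    """Summarize population share and default rates per score band for two datasets.

    Expects columns 'score', 'default_prob' (predicted PD), and 'observed_default' (0/1).
    """
    edges = [-np.inf, *band_edges, np.inf]

    def summarize(df):
        bands = pd.cut(df["score"], bins=edges)
        grouped = df.groupby(bands, observed=False)  # keep empty bands visible
        return pd.DataFrame({
            "population_%": 100 * grouped.size() / len(df),
            "predicted_default_%": 100 * grouped["default_prob"].mean(),
            "observed_default_%": 100 * grouped["observed_default"].mean(),
        })

    ref_summary, mon_summary = summarize(ref), summarize(mon)
    # Relative drift per band and metric, as a percentage of the reference value
    drift = (mon_summary - ref_summary).abs() / ref_summary.replace(0, np.nan) * 100
    return ref_summary, mon_summary, drift.round(2)
```

Comparing the two per-band summaries alongside the drift table gives the population, predicted-rate, and observed-rate shifts that this kind of test reports.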
+ +### Signs of High Risk + +- Large shifts in population distribution across bands +- Significant changes in default rates within bands +- Inconsistent drift patterns between adjacent bands +- Divergence between predicted and observed rates +- Systematic shifts in risk concentration +- Empty or sparse score bands in monitoring data + +### Strengths + +- Provides comprehensive view of score-based drift +- Identifies specific score ranges with instability +- Enables comparison of multiple risk metrics +- Includes both distribution and performance drift +- Supports business-relevant score segmentation +- Maintains interpretable drift thresholds + +### Limitations + +- Sensitive to choice of score band boundaries +- Requires sufficient samples in each band +- Cannot suggest optimal band adjustments +- May not capture within-band distribution changes +- Limited to predefined scoring metrics +- Complex interpretation with multiple drift signals \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ScorecardHistogramDrift.md b/site/tests/ongoing_monitoring/ScorecardHistogramDrift.md new file mode 100644 index 0000000000..95522724d1 --- /dev/null +++ b/site/tests/ongoing_monitoring/ScorecardHistogramDrift.md @@ -0,0 +1,47 @@ +# ScorecardHistogramDrift + +Compares score distributions between reference and monitoring datasets for each class. + +### Purpose + +The Scorecard Histogram Drift test is designed to evaluate changes in the model's scoring +patterns over time. By comparing score distributions between reference and monitoring datasets +for each class, this test helps identify whether the model's scoring behavior remains stable +in production. This is crucial for understanding if the model's risk assessment maintains +consistent patterns and whether specific score ranges have experienced significant shifts +in their distribution. + +### Test Mechanism + +This test proceeds by generating histograms of scores for each class in both reference and +monitoring datasets. It analyzes distribution characteristics through multiple statistical +moments: mean, variance, skewness, and kurtosis. The test quantifies drift as percentage +changes in these moments between datasets, providing both visual and numerical assessments +of distribution stability. Special attention is paid to class-specific distribution changes. + +### Signs of High Risk + +- Significant shifts in score distribution shapes +- Large drifts in distribution moments exceeding threshold +- Changes in the relative positioning of class distributions +- Appearance of new modes or peaks in monitoring data +- Unexpected changes in score spread or concentration +- Systematic shifts in class-specific scoring patterns + +### Strengths + +- Provides class-specific distribution analysis +- Identifies detailed changes in scoring patterns +- Enables visual comparison of distributions +- Includes comprehensive moment analysis +- Supports multiple class evaluation +- Maintains interpretable score scale + +### Limitations + +- Sensitive to binning choices in visualization +- Requires sufficient samples per class +- Cannot suggest score adjustments +- May not capture subtle distribution changes +- Complex interpretation with multiple classes +- Limited to univariate score analysis \ No newline at end of file