diff --git a/.github/actions/demo-notebook/action.yml b/.github/actions/demo-notebook/action.yml index bb7914a727..8e014230b8 100644 --- a/.github/actions/demo-notebook/action.yml +++ b/.github/actions/demo-notebook/action.yml @@ -6,9 +6,9 @@ inputs: description: "Load the created .env file" required: true -runs: +runs: using: "composite" - steps: + steps: - name: Install python3 for Jupyter Notebooks shell: bash run: | @@ -18,10 +18,11 @@ runs: - name: Install validmind for notebook execution shell: bash run: | - pip install validmind - pip install validmind[llm] - pip install fairlearn aequitas + pip install validmind + pip install validmind[llm] + pip install fairlearn aequitas pip install shap==0.44.1 + pip install anywidget - name: Ensure .env file is available shell: bash @@ -36,9 +37,9 @@ runs: shell: bash if: ${{ steps.find_env.outcome == 'success' }} run: | - cd site + cd site source ../${{ inputs.env_file }} - quarto render --profile exe-demo notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { + quarto render --profile exe-demo notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { echo "Execute for intro_for_model_developers_EXECUTED.ipynb failed"; cat render_errors.log; exit 1; diff --git a/.github/actions/prod-notebook/action.yml b/.github/actions/prod-notebook/action.yml index fc17da0dda..ce8612da46 100644 --- a/.github/actions/prod-notebook/action.yml +++ b/.github/actions/prod-notebook/action.yml @@ -6,9 +6,9 @@ inputs: description: "Load the created .env file" required: true -runs: +runs: using: "composite" - steps: + steps: - name: Install python3 for Jupyter Notebooks shell: bash run: | @@ -18,10 +18,11 @@ runs: - name: Install validmind for notebook execution shell: bash run: | - pip install validmind - pip install validmind[llm] - pip install fairlearn aequitas + pip install validmind + pip install validmind[llm] + pip install fairlearn aequitas pip install shap==0.44.1 + pip install anywidget - name: Ensure .env file is available shell: bash @@ -36,9 +37,9 @@ runs: shell: bash if: ${{ steps.find_env.outcome == 'success' }} run: | - cd site + cd site source ../${{ inputs.env_file }} - quarto render --profile exe-prod notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { + quarto render --profile exe-prod notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { echo "Execute for intro_for_model_developers_EXECUTED.ipynb failed"; cat render_errors.log; exit 1; diff --git a/.github/actions/staging-notebook/action.yml b/.github/actions/staging-notebook/action.yml index 4dfb84506c..f53d395380 100644 --- a/.github/actions/staging-notebook/action.yml +++ b/.github/actions/staging-notebook/action.yml @@ -6,9 +6,9 @@ inputs: description: "Load the created .env file" required: true -runs: +runs: using: "composite" - steps: + steps: - name: Install python3 for Jupyter Notebooks shell: bash run: | @@ -18,10 +18,11 @@ runs: - name: Install validmind for notebook execution shell: bash run: | - pip install validmind - pip install validmind[llm] - pip install fairlearn aequitas + pip install validmind + pip install validmind[llm] + pip install fairlearn aequitas pip install shap==0.44.1 + pip install anywidget - name: Ensure .env file is available shell: bash @@ -36,9 +37,9 @@ runs: shell: bash if: ${{ steps.find_env.outcome == 'success' }} run: | - cd site + cd site source ../${{ inputs.env_file }} - quarto render --profile exe-staging 
notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { + quarto render --profile exe-staging notebooks/tutorials/intro_for_model_developers_EXECUTED.ipynb &> render_errors.log || { echo "Execute for intro_for_model_developers_EXECUTED.ipynb failed"; cat render_errors.log; exit 1; diff --git a/.github/workflows/deploy-docs-prod.yaml b/.github/workflows/deploy-docs-prod.yaml index 5378cb7a05..8948ec862a 100644 --- a/.github/workflows/deploy-docs-prod.yaml +++ b/.github/workflows/deploy-docs-prod.yaml @@ -28,8 +28,8 @@ jobs: - name: Render prod docs site run: | - cd site - quarto render --profile production &> render_errors.log || { + cd site + quarto render --profile production &> render_errors.log || { echo "Quarto render failed immediately"; cat render_errors.log; exit 1; @@ -39,11 +39,11 @@ jobs: id: create_env run: | touch .env - echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env - echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env - echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env - echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env - cat .env + echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env + echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env + echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env + echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env + cat .env # Only execute the prod notebook if .env file is created - name: Execute prod Intro for Model Developers notebook diff --git a/.github/workflows/deploy-docs-staging.yaml b/.github/workflows/deploy-docs-staging.yaml index f45a06353c..b08982f28c 100644 --- a/.github/workflows/deploy-docs-staging.yaml +++ b/.github/workflows/deploy-docs-staging.yaml @@ -28,8 +28,8 @@ jobs: - name: Render staging docs site run: | - cd site - quarto render --profile staging &> render_errors.log || { + cd site + quarto render --profile staging &> render_errors.log || { echo "Quarto render failed immediately"; cat render_errors.log; exit 1; @@ -39,11 +39,11 @@ jobs: id: create_env run: | touch .env - echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env - echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env - echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env - echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env - cat .env + echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env + echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env + echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env + echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env + cat .env # Only execute the staging notebook if .env file is created - name: Execute staging Intro for Model Developers notebook diff --git a/.github/workflows/validate-docs-site.yaml b/.github/workflows/validate-docs-site.yaml index 8cea51499a..bba96bf603 100644 --- a/.github/workflows/validate-docs-site.yaml +++ b/.github/workflows/validate-docs-site.yaml @@ -27,8 +27,8 @@ jobs: - name: Render demo docs site run: | - cd site - quarto render --profile development &> render_errors.log || { + cd site + quarto render --profile development &> render_errors.log || { echo "Quarto render failed immediately"; cat render_errors.log; exit 1; @@ -52,21 +52,21 @@ jobs: id: create_env run: | touch .env - echo VM_API_HOST=${{ secrets.PLATFORM_API_HOST }} >> .env - echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env - echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env - echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env - cat .env + echo VM_API_HOST=${{ 
secrets.PLATFORM_API_HOST }} >> .env + echo VM_API_KEY=${{ secrets.PLATFORM_API_KEY }} >> .env + echo VM_API_SECRET=${{ secrets.PLATFORM_API_SECRET }} >> .env + echo VM_API_MODEL=${{ secrets.PLATFORM_DEV_MODEL }} >> .env + cat .env # Only execute the demo notebook if .env file is created - name: Execute demo Intro for Model Developers notebook - if: ${{ steps.create_env.outcome == 'success' }} + if: ${{ env.ENABLE_DEMO_NOTEBOOK == 'true' && steps.create_env.outcome == 'success' }} uses: ./.github/actions/demo-notebook id: execute-demo-notebook with: env_file: .env - - name: Test for warnings or errors + - name: Test for warnings or errors run: | if grep -q 'WARN:\|ERROR:' site/render_errors.log; then echo "Warnings or errors detected during Quarto render" @@ -76,7 +76,7 @@ jobs: echo "No warnings or errors detected during Quarto render" fi - # Demo bucket is in us-east-1 + # Demo bucket is in us-east-1 - name: Configure AWS credentials run: aws configure set aws_access_key_id ${{ secrets.AWS_ACCESS_KEY_ID }} && aws configure set aws_secret_access_key ${{ secrets.AWS_SECRET_ACCESS_KEY }} && aws configure set default.region us-east-1 diff --git a/README.md b/README.md index c4d9eed536..c377e6bb91 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,8 @@ You need: ## How to contribute -First, read through and familiarize yourself with our [ValidMind style guide](https://docs.validmind.ai/about/contributing/style-guide/style-guide.html). +> [!IMPORTANT] +> First, read through and familiarize yourself with our [ValidMind style guide](https://docs.validmind.ai/about/contributing/style-guide/style-guide.html). - Our core user guides are sourced in Quarto Markdown under [`site/guide`](https://github.com/validmind/documentation/tree/main/site/guide). - If you create new documentation, make sure to add it to the [`_quarto.yml`](https://github.com/validmind/documentation/blob/main/site/_quarto.yml) file. @@ -133,7 +134,8 @@ make get-source After you pull in the changes, commit them to this repo as part of the release notes process. -**Want to author new code samples?** Refer to our [Jupyter Notebook template Quickstart](https://github.com/validmind/validmind-library/tree/main/notebooks/templates)! +> [!TIP] +> **Want to author new code samples?** Refer to our [Jupyter Notebook template Quickstart](https://github.com/validmind/validmind-library/tree/main/notebooks/templates)! diff --git a/site/_quarto.yml b/site/_quarto.yml index 04e111ed8d..84c238b7d7 100644 --- a/site/_quarto.yml +++ b/site/_quarto.yml @@ -170,6 +170,7 @@ website: - text: "---" - text: "Releases" # MAKE-RELEASE-NOTES-EMBED-MARKER + - releases/2025/2025-jan-31/release-notes.qmd # CURRENT-YEAR-END-MARKER - file: releases/2024/2024-releases.qmd contents: diff --git a/site/developer/model-testing/testing-overview.qmd b/site/developer/model-testing/testing-overview.qmd index 1ac300ea0f..99666a993f 100644 --- a/site/developer/model-testing/testing-overview.qmd +++ b/site/developer/model-testing/testing-overview.qmd @@ -51,6 +51,9 @@ listing: contents: - ../../notebooks/code_samples/custom_tests/implement_custom_tests.ipynb - ../../notebooks/code_samples/custom_tests/integrate_external_test_providers.ipynb + - path: /notebooks/how_to/add_context_to_llm_descriptions.ipynb + title: "Add context to LLM-generated test descriptions" + description: "Learn how to add custom use case and test-specific context to your LLM-generated test descriptions." 
--- {{< var vm.product >}} provides many built-in tests and test suites, which help you produce documentation during stages of the model development lifecycle where you need to validate that your work satisfies MRM (model risk management) requirements. diff --git a/site/faq/faq-testing.qmd b/site/faq/faq-testing.qmd index ad8ae129f0..5f1e0db840 100644 --- a/site/faq/faq-testing.qmd +++ b/site/faq/faq-testing.qmd @@ -38,6 +38,11 @@ Yes, {{< var vm.product >}} allows tests to be manipulated at several levels: - You can change the thresholds and parameters for default tests already available in the {{< var vm.developer >}} — for instance, changing the threshold parameter for the class imbalance flag.[^5] - You can also connect your own custom tests with the {{< var validmind.developer >}}. These custom tests are configurable and are able to run programmatically, just like the rest of the {{< var vm.developer >}}.[^6] +::: {.callout} +In addition to custom tests, you can also add use case and test-specific context for any test to enhance the LLM-generated test descriptions using the {{< var validmind.developer >}}.[^7] + +::: + {{< include _faq-explainability.qmd >}} {{< include _faq-synthetic-datasets.qmd >}} @@ -62,4 +67,6 @@ Yes, {{< var vm.product >}} allows tests to be manipulated at several levels: [^5]: [`ClassImbalance()`](/validmind/validmind/tests/data_validation/ClassImbalance.html#ClassImbalance) -[^6]: [Can I use my own tests?](/developer/model-testing/testing-overview.qmd#can-i-use-my-own-tests) \ No newline at end of file +[^6]: [Can I use my own tests?](/developer/model-testing/testing-overview.qmd#can-i-use-my-own-tests) + +[^7]: [Add context to LLM-generated test descriptions](/notebooks/how_to/add_context_to_llm_descriptions.ipynb) \ No newline at end of file diff --git a/site/guide/model-documentation/equation-preview.png b/site/guide/model-documentation/equation-preview.png new file mode 100644 index 0000000000..f488480048 Binary files /dev/null and b/site/guide/model-documentation/equation-preview.png differ diff --git a/site/guide/model-documentation/export-documentation.qmd b/site/guide/model-documentation/export-documentation.qmd index bbf02dcf3b..aedd395c1a 100644 --- a/site/guide/model-documentation/export-documentation.qmd +++ b/site/guide/model-documentation/export-documentation.qmd @@ -7,6 +7,11 @@ aliases: Export your model documentation or validation reports as Microsoft Word files (`.docx`) for use outside of the {{< var validmind.platform >}}. +::: {.callout} +{{< var vm.product >}} supports Word 365, Word 2019, Word 2016, and Word 2013. +::: + + ::: {.attn} ## Prerequisites @@ -16,10 +21,6 @@ Export your model documentation or validation reports as Microsoft Word files (` - [x] Model documentation is completed or in progress.[^2] - [x] You are a [{{< fa code >}} Developer]{.bubble} or [{{< fa circle-check >}} Validator]{.bubble}, or assigned another role with sufficient permissions to perform the tasks in this guide.[^3] -::: {.callout} -{{< var vm.product >}} supports Word 365, Word 2019, Word 2016, and Word 2013. -::: - ::: ## Export model documentation @@ -32,13 +33,7 @@ Export your model documentation or validation reports as Microsoft Word files (` 4. In right sidebar, click **{{< fa download >}} Export Document**. -5. Configure the export options: - - - - Choose the file format for export. We currently support exporting to `.docx` for Microsoft Word format. - -6. 
Click **{{< fa file-arrow-down >}} Download File** to download the file locally on your machine. +7. Click **{{< fa file-arrow-down >}} Download File** to download the file locally on your machine. ## Export validation report @@ -46,16 +41,15 @@ Export your model documentation or validation reports as Microsoft Word files (` 2. Select a model or find your model by applying a filter or searching for it.[^5] - 3. In the left sidebar that appears for your model, click **{{< fa shield >}} Validation Report**. 4. In right sidebar, click **{{< fa download >}} Export Document**. -5. Configure the export options: +5. Configure what is exported in your document by checking off the relevant boxes: - - - Choose the file format for export. We currently support exporting to `.docx` for Microsoft Word format. + - Include compliance summary[^6] + - Include validation guidelines information[^7] + - Include validation guideline adherence details 6. Click **{{< fa file-arrow-down >}} Download File** to download the file locally on your machine. @@ -80,3 +74,6 @@ Export your model documentation or validation reports as Microsoft Word files (` [^5]: [Working with the model inventory](/guide/model-inventory/working-with-model-inventory.qmd#search-filter-and-sort-models) +[^6]: [Assess compliance](/guide/model-validation/assess-compliance.qmd) + +[^7]: [Manage validation guidelines](/guide/model-validation/manage-validation-guidelines.qmd) diff --git a/site/guide/model-documentation/formula-display-modes.png b/site/guide/model-documentation/formula-display-modes.png new file mode 100644 index 0000000000..965dc59699 Binary files /dev/null and b/site/guide/model-documentation/formula-display-modes.png differ diff --git a/site/guide/model-documentation/mathtype-full-demo.gif b/site/guide/model-documentation/mathtype-full-demo.gif deleted file mode 100644 index f4c0eddcf3..0000000000 Binary files a/site/guide/model-documentation/mathtype-full-demo.gif and /dev/null differ diff --git a/site/guide/model-documentation/work-with-content-blocks.qmd b/site/guide/model-documentation/work-with-content-blocks.qmd index c95ba3851a..613061dc4f 100644 --- a/site/guide/model-documentation/work-with-content-blocks.qmd +++ b/site/guide/model-documentation/work-with-content-blocks.qmd @@ -63,18 +63,31 @@ Use {{< var vm.product >}} to assist you with generating content via AI!^[[Gener ### Add mathematical formulas -While editing a simple text block, you can insert math equations using the editor: +While editing a simple text block, you can insert math equations using the formula editor: -1. Click **√** in the toolbar while editing a content block. +1. Click **$f(x)$** in the toolbar while editing a content block. -2. You can use the interface to type out the equation, or paste in a LaTeX formula: +2. Enter your LaTeX formula in the text box and confirm that the Equation preview generates as expected: -![Inserting a mathematical formula using LaTex within a simple text block](mathtype-full-demo.gif){width=100% fig-alt="An animation that shows how to insert a mathematical formula using LaTex within a simple text block" .screenshot width=90%} + - To insert the equation inline, leave **Display mode** untoggled. + - To insert the equation on its own line, toggle **Display mode** on. -3. Click **Insert** to add the equation to your content block. 
+::: {.column-margin}
+![Equation preview in the formula editor](equation-preview.png){fig-alt="A screenshot showing the equation preview in the formula editor" .screenshot group="latex"}
+
+:::
+
+ ![The two formula display modes](formula-display-modes.png){fig-alt="A screenshot showing the two formula display modes" .screenshot group="latex"}
+
+3. Click **[{{< fa check >}}]{.green}** to add the equation to your content block. ### Generate Text with AI [beta]{.smallcaps} +::: {.callout title="Have you logged your tests?"} +Generating content drafts for your model documentation works best after you've logged tests and test descriptions for your model's tests with the {{< var validmind.developer >}},[^8] as existing test descriptions provide more context for the large language model (LLM) to draw upon. + +::: + While editing a simple text block, you can have {{< var vm.product >}} assist you with generating content drafts: 1. Click **{{< fa diamond >}} [beta]{.smallcaps} (Generate Text with AI)** in the toolbar while editing a content block. @@ -84,19 +97,18 @@ While editing a simple text block, you can have {{< var vm.product >}} assist yo 3. Review the draft composed by the {{< var vm.product >}} AI Content Builder for accuracy and relevance, then: - Click **{{< fa download >}} Accept Text** to insert the draft into your content block. - - Click **{{< fa rotate >}} Try Again** to regenerate a different draft. + - Click **{{< fa rotate >}} Try Again** to regenerate a different draft. - Click **Cancel** to discard the draft and return to your documentation section. 4. After you insert the AI-generated draft, click on the text box to make the necessary edits and adjustments to your copy: - Ensure that content is in compliance with the quality guidelines outlined by your organization. - - Use the content editing toolbar[^8] just as you would with any other text block. + - Use the content editing toolbar[^9] just as you would with any other text block. ![Generating content with AI within a simple text block](generate-with-ai.gif){width=90% fig-alt="An animation that showcases the Generate with AI feature within a simple text block" .screenshot} -::: {.callout} -When generating content drafts with AI, accepted versions and edits are retained in your {{< fa wifi >}} Model Activity[^9] just like other updates to your documentation, reports, or plans. -::: +When generating content drafts with AI, accepted versions and edits are retained in your {{< fa wifi >}} Model Activity[^10] just like other updates to your documentation, reports, or plans. + 

## Remove content blocks

@@ -106,7 +118,7 @@ Test-driven or metric over time blocks can be re-added later on but **text block 1. In the left sidebar, click **{{< fa cubes >}} Inventory**. -2. Select a model or find your model by applying a filter or searching for it.[^10] +2. Select a model or find your model by applying a filter or searching for it.[^11] 3. In the left sidebar that appears for your model, click **{{< fa book-open >}} Documentation**, **{{< fa shield >}} Validation Report**, or **{{< fa desktop >}} Ongoing Monitoring**. 
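The formula-editor steps in the work-with-content-blocks.qmd hunk above have you paste a LaTeX formula and choose inline or standalone rendering with the Display mode toggle. As a purely hypothetical illustration (this expression is not taken from the ValidMind documentation), a formula you might paste could look like the following; the toggle, not the markup, decides how it renders:

```latex
% Hypothetical example only -- any valid LaTeX expression works here.
% Paste just the expression; the editor's Display mode toggle controls
% whether it renders inline or as a standalone displayed equation.
\mathrm{PD}(x) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + \cdots + \beta_k x_k)}}
```

Confirming that the Equation preview matches what you expect before clicking the green check mark is the quickest way to catch a malformed expression.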
@@ -151,8 +163,10 @@ Test-driven or metric over time blocks can be re-added later on but **text block [^7]: [Collaborate with others](/guide/model-documentation/collaborate-with-others.qmd) -[^8]: [Content editing toolbar](#content-editing-toolbar) +[^8]: [Run tests and test suites](/developer/model-testing/testing-overview.qmd) + +[^9]: [Content editing toolbar](#content-editing-toolbar) -[^9]: [View model activity](/guide/model-inventory/view-model-activity.qmd) +[^10]: [View model activity](/guide/model-inventory/view-model-activity.qmd) -[^10]: [Working with the model inventory](/guide/model-inventory/working-with-model-inventory.qmd#search-filter-and-sort-models) \ No newline at end of file +[^11]: [Working with the model inventory](/guide/model-inventory/working-with-model-inventory.qmd#search-filter-and-sort-models) \ No newline at end of file diff --git a/site/guide/monitoring/example-f1-score.png b/site/guide/monitoring/example-f1-score.png new file mode 100644 index 0000000000..f81b1a13be Binary files /dev/null and b/site/guide/monitoring/example-f1-score.png differ diff --git a/site/guide/monitoring/metric-over-time-data.png b/site/guide/monitoring/metric-over-time-data.png index 4c3a264673..5b5bdbfad1 100644 Binary files a/site/guide/monitoring/metric-over-time-data.png and b/site/guide/monitoring/metric-over-time-data.png differ diff --git a/site/guide/monitoring/work-with-metrics-over-time.qmd b/site/guide/monitoring/work-with-metrics-over-time.qmd index 4cd46e67c2..26f6038fe2 100644 --- a/site/guide/monitoring/work-with-metrics-over-time.qmd +++ b/site/guide/monitoring/work-with-metrics-over-time.qmd @@ -5,18 +5,27 @@ date: last-modified Once generated via the {{< var validmind.developer >}}, view and add metrics over time to your ongoing monitoring plans in the {{< var validmind.platform >}}. -Metrics over time refers to the continued monitoring of a model's performance once it is deployed. Tracking how a model performs as new data is introduced or conditions change ensures that it remains accurate and reliable in real-world environments where data distributions or market conditions shift. +Metrics over time refers to the continued monitoring of a model's performance once it is deployed. Tracking how a model performs as new data is introduced or conditions change ensures that it remains accurate and reliable in real-world environments where data distributions or market conditions shift. -- Model performance is determined by continuously measuring metrics and comparing them over time to detect degradation, bias, or shifts in the model's output. -- Performance data is collected and tracked over time, often using a rolling window approach or real-time monitoring tools with the same metrics used in testing, but observed across different periods. +- Model performance is determined by continuously measuring metrics and comparing them over time to detect degradation, bias, or shifts in the model's output. +- Performance data is collected and tracked over time, often using a rolling window approach or real-time monitoring tools with the same metrics used in testing, but observed across different periods. - Continuous tracking helps to identify if and when a model needs to be recalibrated, retrained, or even replaced due to performance deterioration or changing conditions. 
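The bullet points above describe continuously measuring metrics with the ValidMind Library and comparing them over time. A minimal sketch of what that logging step might look like follows; the `log_metric` helper, its import location, and the `recorded_at`/`thresholds` arguments are assumptions inferred from the linked `log_metrics_over_time.ipynb` notebook rather than details confirmed by this diff, so defer to that notebook for the real API.

```python
# Hedged sketch only: the log_metric call and its argument names are
# assumptions based on the linked log_metrics_over_time.ipynb notebook,
# not on anything in this patch.
from datetime import datetime

import validmind as vm
from sklearn.metrics import f1_score

# Placeholders, exactly as in the notebooks in this patch -- replace with
# the code snippet values for your registered model.
vm.init(api_host="...", api_key="...", api_secret="...", model="...")

# Toy monitoring labels and predictions so the sketch is self-contained.
y_true = [0, 1, 1, 0, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 0, 1, 1]

vm.log_metric(
    key="F1 Score",                  # name surfaced in the Metric Over Time block
    value=f1_score(y_true, y_pred),  # numeric value tracked over time
    recorded_at=datetime.utcnow(),   # timestamp for this observation
    thresholds={"high_risk": 0.5, "medium_risk": 0.7},  # assumed threshold format
)
```

Repeated on a schedule from a monitoring pipeline, calls like this accumulate the time series that the Metric Over Time blocks described below visualize.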
+::: {.column-margin} +::: {.callout} +## **[Log metrics over time {{< fa hand-point-right >}}](/notebooks/how_to/log_metrics_over_time.ipynb)** + +Learn how to log metrics over time, set thresholds, and analyze model performance trends with our Jupyter Notebook sample. +::: + +::: + ::: {.attn} ## Prerequisites - [x] {{< var link.login >}} -- [x] Metrics over time have already been logged via the {{< var validmind.developer >}} for your model.[^1] +- [x] Metrics over time have already been logged via the {{< var validmind.developer >}} for your model.[^1] - [x] You are a [{{< fa code >}} Developer]{.bubble} or assigned another role with sufficient permissions to perform the tasks in this guide.[^2] ::: @@ -44,7 +53,7 @@ Metrics over time refers to the continued monitoring of a model's performance on - Select the metric over time to insert into the model documentation from the list of available metrics. - Search by name using **{{}} Search** on the top-left to locate specific metrics. - ![Metric over time blocks that have been selected for insertion](metrics-over-time-menu.png){width=90% fig-alt="A screenshot showing several metric over time blocks that have been selected for insertion" .screenshot} + ![Metric Over Time blocks that have been selected for insertion](metrics-over-time-menu.png){fig-alt="A screenshot showing several Metric Over Time blocks that have been selected for insertion" .screenshot group="time-metric"} To preview what is included in a metric, click on it. By default, the actively selected metric is previewed. @@ -52,6 +61,8 @@ Metrics over time refers to the continued monitoring of a model's performance on 8. After inserting the metrics into your document, review the data to confirm that it is accurate and relevant. + ![Example F1 Score — Metric Over Time visualization](example-f1-score.png){fig-alt="A screenshot showing an example F1 Score — Metric Over Time visualization" .screenshot group="time-metric"} + ## View metric over time metadata @@ -60,6 +71,7 @@ After you have added metrics over time to your document, you can view the follow - Date and time the metric was recorded - Who updated the metric - The numeric value of the metric +- The metric's thresholds - Any additional parameters 1. In the left sidebar, click **{{< fa cubes >}} Inventory**. @@ -68,11 +80,11 @@ After you have added metrics over time to your document, you can view the follow 3. In the left sidebar that appears for your model, click **{{< fa book-open >}} Documentation** or **{{< fa desktop >}} Ongoing Monitoring**. -4. Locate the metric whose metadata you want to view. +4. Locate the metric whose metadata you want to view. -5. Under the metric's name, click on **Data** tab. +5. Under the metric's name, click on **Data** tab. 
- ![](metric-over-time-data.png){width=85% fig-alt="A screenshot showing the Data tab within a metric over time" .screenshot} + ![Example Data tab within a Metric Over Time](metric-over-time-data.png){fig-alt="A screenshot showing an example Data tab within a Metric Over Time" .screenshot} ## What's next @@ -85,7 +97,7 @@ After you have added metrics over time to your document, you can view the follow -[^1]: [Intro to Unit Metrics](/notebooks/how_to/run_unit_metrics.ipynb) +[^1]: [Log metrics over time](/notebooks/how_to/log_metrics_over_time.ipynb) [^2]: [Manage permissions](/guide/configuration/manage-permissions.qmd) diff --git a/site/index.qmd b/site/index.qmd index 4558c52ea3..7a6d0c5f51 100644 --- a/site/index.qmd +++ b/site/index.qmd @@ -46,12 +46,12 @@ listing: fields: [title, description] contents: # MAKE-RELEASE-NOTES-LATEST-MARKER + - /releases/2025/2025-jan-31/release-notes.qmd - /releases/2024/2024-dec-24/release-notes.qmd - /releases/2024/2024-dec-06/release-notes.qmd - /releases/2024/2024-oct-22/release-notes.qmd - /releases/2024/2024-sep-25/release-notes.qmd - /releases/2024/2024-sep-09/release-notes.qmd - - /releases/2024/2024-aug-13/release-notes.qmd # MAKE-RELEASE-NOTES-OLDEST-MARKER - id: validmind-academy type: grid diff --git a/site/notebooks.zip b/site/notebooks.zip index b971c34903..115a60ad76 100644 Binary files a/site/notebooks.zip and b/site/notebooks.zip differ diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb new file mode 100644 index 0000000000..3ee2b1e6bb --- /dev/null +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Document an application scorecard model\n", + "\n", + "Build and document an *application scorecard model* with the ValidMind Library by using Kaggle's [Lending Club](https://www.kaggle.com/datasets/devanshi23/loan-data-2007-2014/data) sample dataset to build a simple application scorecard.\n", + "\n", + "An application scorecard model is a type of statistical model used in credit scoring to evaluate the creditworthiness of potential borrowers by generating a score based on various characteristics of an applicant — such as credit history, income, employment status, and other relevant financial data. \n", + "\n", + "- This score helps lenders make decisions about whether to approve or reject loan applications, as well as determine the terms of the loan, including interest rates and credit limits. \n", + "- Application scorecard models enable lenders to manage risk efficiently while making the loan application process faster and more transparent for applicants.\n", + "\n", + "This interactive notebook provides a step-by-step guide for loading a demo dataset, preprocessing the raw data, training a model for testing, setting up test inputs, initializing the required ValidMind objects, running the test, and then logging the results to ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n",
    "\n",
    "Signing up is FREE — Register with ValidMind
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: The [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Credit Risk Scorecard`\n", + " - Use case: `Credit Risk - CECL`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Document the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.datasets.credit_risk import lending_club\n", + "from validmind.utils import preview_test_config\n", + "\n", + "scorecard = lending_club.load_scorecard()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "lending_club.init_vm_objects(scorecard)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_config = lending_club.load_test_config(scorecard)\n", + "preview_test_config(test_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.run_documentation_tests(config=test_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", + "\n", + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. In the ValidMind Platform, go to the **Documentation** page for the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html))\n", + "\n", + "2. 
Expand the following sections and take a look around:\n", + "\n", + " - **2. Data Preparation**\n", + " - **3. Model Development**\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation (hint: some of the tests in **2.3. Feature Selection and Engineering** look like they need some attention), view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready.\n", + "\n", + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/developer/model-testing/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/developer/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-eEL8LtKG-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb index 1e956cc1aa..750ebc9672 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb @@ -605,7 +605,7 @@ "\n", " For each metric in the test results, include in the test overview:\n", " - The metric's purpose and what it measures\n", - " - Its mathematical formula in LaTeX notation\n", + " - Its mathematical formula\n", " - The range of possible values\n", " - What constitutes good/bad performance\n", " - How to interpret different values\n", diff --git a/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb b/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb index 03a6180b83..26a983f10d 100644 --- a/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb +++ b/site/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb @@ -545,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -606,7 +606,7 @@ "\n", " For each metric in the test results, include in the test overview:\n", " - The metric's purpose and what it measures\n", - " - Its mathematical formula in LaTeX notation\n", + " - Its mathematical formula\n", " - The range of possible values\n", " - What constitutes good/bad performance\n", " - How to interpret different values\n", @@ -648,15 +648,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DatasetDescription:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DatasetDescription:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ").log()" ] }, { @@ -665,15 +662,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " }\n", - " ).log()" + "run_test(\n", + " 
\"validmind.data_validation.DescriptiveStatistics:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " }\n", + ").log()" ] }, { @@ -682,18 +676,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MissingValues:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MissingValues:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -702,18 +693,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.ClassImbalance:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.ClassImbalance:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 10\n", + " }\n", + ").log()" ] }, { @@ -722,18 +710,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.Duplicates:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.Duplicates:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -742,20 +727,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.HighCardinality:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"num_threshold\": 100,\n", - " \"percent_threshold\": 0.1,\n", - " \"threshold_type\": \"percent\"\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.HighCardinality:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"num_threshold\": 100,\n", + " \"percent_threshold\": 0.1,\n", + " \"threshold_type\": \"percent\"\n", + " }\n", + ").log()" ] }, { @@ -764,18 +746,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.Skewness:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"max_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.Skewness:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"max_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -784,18 +763,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.UniqueRows:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.UniqueRows:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ 
-804,18 +780,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TooManyZeroValues:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"max_percent_threshold\": 0.03\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TooManyZeroValues:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"max_percent_threshold\": 0.03\n", + " }\n", + ").log()" ] }, { @@ -824,18 +797,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.IQROutliersTable:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"threshold\": 5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.IQROutliersTable:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"threshold\": 5\n", + " }\n", + ").log()" ] }, { @@ -853,15 +823,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset,\n", + " }\n", + ").log()" ] }, { @@ -870,15 +837,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularDescriptionTables:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularDescriptionTables:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + ").log()" ] }, { @@ -887,18 +851,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MissingValues:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MissingValues:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -907,15 +868,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularNumericalHistograms:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + ").log()" ] }, { @@ -924,15 +882,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularCategoricalBarPlots:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularCategoricalBarPlots:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " 
},\n", + ").log()" ] }, { @@ -941,18 +896,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TargetRateBarPlots:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " params={\n", - " \"default_column\": lending_club.target_column,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TargetRateBarPlots:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + " params={\n", + " \"default_column\": lending_club.target_column,\n", + " },\n", + ").log()" ] }, { @@ -968,15 +920,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " }\n", + ").log()" ] }, { @@ -985,15 +934,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularDescriptionTables:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularDescriptionTables:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1002,18 +948,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.ClassImbalance:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.ClassImbalance:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 10\n", + " }\n", + ").log()" ] }, { @@ -1022,18 +965,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.UniqueRows:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.UniqueRows:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -1042,15 +982,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularNumericalHistograms:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1066,18 +1003,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MutualInformation:development_data\",\n", - " input_grid ={\n", - 
" \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.01,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MutualInformation:development_data\",\n", + " input_grid ={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.01,\n", + " },\n", + ").log()" ] }, { @@ -1086,15 +1020,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.PearsonCorrelationMatrix:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.PearsonCorrelationMatrix:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " }\n", + ").log()" ] }, { @@ -1103,19 +1034,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.HighPearsonCorrelation:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.3,\n", - " \"top_n_correlations\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.HighPearsonCorrelation:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.3,\n", + " \"top_n_correlations\": 10\n", + " }\n", + ").log()" ] }, { @@ -1124,18 +1052,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.WOEBinTable\",\n", - " input_grid={\n", - " \"dataset\": [vm_preprocess_dataset]\n", - " },\n", - " params={\n", - " \"breaks_adj\": lending_club.breaks_adj,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.WOEBinTable\",\n", + " input_grid={\n", + " \"dataset\": [vm_preprocess_dataset]\n", + " },\n", + " params={\n", + " \"breaks_adj\": lending_club.breaks_adj,\n", + " },\n", + ").log()" ] }, { @@ -1144,18 +1069,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.WOEBinPlots\",\n", - " input_grid={\n", - " \"dataset\": [vm_preprocess_dataset]\n", - " },\n", - " params={\n", - " \"breaks_adj\": lending_club.breaks_adj,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.WOEBinPlots\",\n", + " input_grid={\n", + " \"dataset\": [vm_preprocess_dataset]\n", + " },\n", + " params={\n", + " \"breaks_adj\": lending_club.breaks_adj,\n", + " },\n", + ").log()" ] }, { @@ -1173,15 +1095,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DatasetSplit\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DatasetSplit\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1190,15 +1109,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.ModelMetadata\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.ModelMetadata\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model, 
vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1207,15 +1123,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ModelParameters\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ModelParameters\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1231,16 +1144,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.GINITable\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.GINITable\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1249,16 +1159,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ClassifierPerformance\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ClassifierPerformance\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1267,19 +1174,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.TrainingTestDegradation:XGBoost\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.TrainingTestDegradation:XGBoost\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.1\n", + " }\n", + ").log()" ] }, { @@ -1288,19 +1192,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.TrainingTestDegradation:RandomForest\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_rf_model,\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.TrainingTestDegradation:RandomForest\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_rf_model,\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.1\n", + " }\n", + ").log()" ] }, { @@ -1309,23 +1210,19 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " # Run the test\n", - " result = run_test(\n", - " \"validmind.model_validation.sklearn.HyperParametersTuning\",\n", - " inputs={\n", - " \"model\": vm_xgb_model,\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - " params={\n", - " \"param_grid\": {'n_estimators': [50, 100]},\n", - " \"scoring\": ['roc_auc', 'recall'],\n", - " \"fit_params\": {'eval_set': [(x_test, y_test)], 'verbose': False},\n", - " \"thresholds\": [0.3, 0.5],\n", - " }\n", - " ).log()" + 
"run_test(\n", + " \"validmind.model_validation.sklearn.HyperParametersTuning\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + " params={\n", + " \"param_grid\": {'n_estimators': [50, 100]},\n", + " \"scoring\": ['roc_auc', 'recall'],\n", + " \"fit_params\": {'eval_set': [(x_test, y_test)], 'verbose': False},\n", + " \"thresholds\": [0.3, 0.5],\n", + " }\n", + ").log()" ] }, { @@ -1343,16 +1240,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ROCCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ROCCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1361,19 +1255,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumROCAUCScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumROCAUCScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.5\n", + " }\n", + ").log()" ] }, { @@ -1382,16 +1273,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1400,16 +1288,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.CumulativePredictionProbabilities\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.CumulativePredictionProbabilities\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1418,20 +1303,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"num_bins\": 10,\n", - " \"mode\": \"fixed\"\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"num_bins\": 10,\n", + " \"mode\": \"fixed\"\n", + " }\n", + ").log()" ] }, { @@ -1447,19 +1329,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - 
"\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ClassifierThresholdOptimization\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " \"model\": vm_xgb_model\n", - " },\n", - " params={\n", - " \"target_recall\": 0.8 # Find a threshold that achieves a recall of 80%\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ClassifierThresholdOptimization\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " \"model\": vm_xgb_model\n", + " },\n", + " params={\n", + " \"target_recall\": 0.8 # Find a threshold that achieves a recall of 80%\n", + " }\n", + ").log()" ] }, { @@ -1468,16 +1347,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.CalibrationCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.CalibrationCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1486,16 +1362,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ConfusionMatrix\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ConfusionMatrix\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1504,19 +1377,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumAccuracy\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.7\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumAccuracy\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.7\n", + " }\n", + ").log()" ] }, { @@ -1525,19 +1395,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumF1Score\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumF1Score\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.5\n", + " }\n", + ").log()" ] }, { @@ -1546,16 +1413,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PrecisionRecallCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model]\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PrecisionRecallCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model]\n", + " },\n", + ").log()" ] }, { @@ -1571,16 +1435,13 @@ 
"metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" ] }, { @@ -1589,19 +1450,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", - " inputs={\n", - " \"model\": vm_xgb_model,\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"cut_off_threshold\": 0.04\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"cut_off_threshold\": 0.04\n", + " }\n", + ").log()" ] }, { @@ -1610,26 +1468,23 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"scaling_factor_std_dev_list\": [\n", - " 0.1,\n", - " 0.2,\n", - " 0.3,\n", - " 0.4,\n", - " 0.5\n", - " ],\n", - " \"performance_decay_threshold\": 0.05\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"scaling_factor_std_dev_list\": [\n", + " 0.1,\n", + " 0.2,\n", + " 0.3,\n", + " 0.4,\n", + " 0.5\n", + " ],\n", + " \"performance_decay_threshold\": 0.05\n", + " }\n", + ").log()" ] }, { @@ -1647,16 +1502,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model]\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model]\n", + " }\n", + ").log()" ] }, { @@ -1665,16 +1517,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.FeaturesAUC\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.FeaturesAUC\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1683,20 +1532,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"kernel_explainer_samples\": 10,\n", - " \"tree_or_linear_explainer_samples\": 200,\n", - " }\n", - " 
).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"kernel_explainer_samples\": 10,\n", + " \"tree_or_linear_explainer_samples\": 200,\n", + " }\n", + ").log()" ] }, { @@ -1712,18 +1558,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.ScorecardHistogram\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"score_column\": \"xgb_scores\",\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.ScorecardHistogram\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " },\n", + ").log()" ] }, { @@ -1732,20 +1575,115 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", + "run_test(\n", + " \"validmind.data_validation.ScoreBandDefaultRates\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params = {\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.ScoreProbabilityAlignment\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom tests\n", + "\n", + "Custom tests extend the functionality of ValidMind, allowing you to document any model or use case with added flexibility.\n", "\n", - " run_test(\n", - " \"validmind.data_validation.ScoreBandDefaultRates\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params = {\n", - " \"score_column\": \"xgb_scores\",\n", - " \"score_bands\": [500, 540, 570]\n", - " }\n", - " ).log()" + "ValidMind provides a comprehensive set of tests out-of-the-box to evaluate and document your models and datasets. We recognize there will be cases where the default tests do not support a model or dataset, or specific documentation is needed. In these cases, you can create and use your own custom code to accomplish what you need. To streamline custom code integration, we support the creation of custom test functions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### In-line custom tests\n", + "\n", + "The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ScoreToOdds\"`. The function `score_to_odds_analysis` takes three arguments `dataset`, `score_column`, and `score_bands`. This is a `VMDataset` and the rest are parameters that can be passed in." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ScoreToOdds\")\n", + "def score_to_odds_analysis(dataset, score_column='score', score_bands=[410, 440, 470]):\n", + " \"\"\"\n", + " Analyzes the relationship between score bands and odds (good:bad ratio).\n", + " Good odds = (1 - default_rate) / default_rate\n", + " \n", + " Higher scores should correspond to higher odds of being good.\n", + " \"\"\"\n", + " df = dataset.df\n", + " \n", + " # Create score bands\n", + " df['score_band'] = pd.cut(\n", + " df[score_column],\n", + " bins=[-np.inf] + score_bands + [np.inf],\n", + " labels=[f'<{score_bands[0]}'] + \n", + " [f'{score_bands[i]}-{score_bands[i+1]}' for i in range(len(score_bands)-1)] +\n", + " [f'>{score_bands[-1]}']\n", + " )\n", + " \n", + " # Calculate metrics per band\n", + " results = df.groupby('score_band').agg({\n", + " dataset.target_column: ['mean', 'count']\n", + " })\n", + " \n", + " results.columns = ['Default Rate', 'Total']\n", + " results['Good Count'] = results['Total'] - (results['Default Rate'] * results['Total'])\n", + " results['Bad Count'] = results['Default Rate'] * results['Total']\n", + " results['Odds'] = results['Good Count'] / results['Bad Count']\n", + " \n", + " # Create visualization\n", + " fig = go.Figure()\n", + " \n", + " # Add odds bars\n", + " fig.add_trace(go.Bar(\n", + " name='Odds (Good:Bad)',\n", + " x=results.index,\n", + " y=results['Odds'],\n", + " marker_color='blue'\n", + " ))\n", + " \n", + " fig.update_layout(\n", + " title='Score-to-Odds Analysis',\n", + " yaxis=dict(title='Odds Ratio (Good:Bad)'),\n", + " showlegend=False\n", + " )\n", + " \n", + " return fig" ] }, { @@ -1754,19 +1692,71 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", + "run_test(\n", + " \"my_custom_tests.ScoreToOdds\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Local test provider\n", + "\n", + "The ValidMind Library offers the ability to extend the built-in library of tests with custom tests. A test \"Provider\" is a Python class that gets registered with the ValidMind Library and loads tests based on a test ID, for example `my_test_provider.my_test_id`. The built-in suite of tests that ValidMind offers is technically its own test provider. You can use one the built-in test provider offered by ValidMind (`validmind.tests.test_providers.LocalTestProvider`) or you can create your own. More than likely, you'll want to use the `LocalTestProvider` to add a directory of custom tests but there's flexibility to be able to load tests from any source." 
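The registration call that follows uses the built-in `LocalTestProvider`. If you need to load tests from somewhere other than a local folder, you can supply your own provider object. The sketch below assumes the same minimal contract that `LocalTestProvider` satisfies, namely a `load_test(test_id)` method that returns a test function; the package path `my_org.tests` is purely hypothetical.

```python
# Sketch of a custom test provider, assuming the provider contract is a single
# load_test(test_id) method that returns a test function, as LocalTestProvider does.
# "my_org.tests" is a hypothetical package used only for illustration.
import importlib


class PackageTestProvider:
    """Loads test functions from an installed Python package instead of a local folder."""

    def __init__(self, package: str):
        self.package = package

    def load_test(self, test_id: str):
        # test_id is assumed to arrive without the registered namespace prefix,
        # e.g. "classification.ConfusionMatrix"
        module_path, func_name = f"{self.package}.{test_id}".rsplit(".", 1)
        module = importlib.import_module(module_path)
        return getattr(module, func_name)


# Hypothetical registration, mirroring the LocalTestProvider example that follows:
# vm.tests.register_test_provider(
#     namespace="my_org",
#     test_provider=PackageTestProvider("my_org.tests"),
# )
```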
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import LocalTestProvider\n", "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ScoreProbabilityAlignment\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"score_column\": \"xgb_scores\",\n", - " },\n", - " ).log()" + "# Define the folder where your tests are located\n", + "tests_folder = \"custom_tests\"\n", + "\n", + "# initialize the test provider with the tests folder we created earlier\n", + "my_test_provider = LocalTestProvider(tests_folder)\n", + "\n", + "vm.tests.register_test_provider(\n", + " namespace=\"my_test_provider\",\n", + " test_provider=my_test_provider,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our test provider set up, we can run any test that's located in our tests folder by using the `run_test()` method. This function is your entry point to running single tests in the ValidMind Library. It takes a test ID and runs the test associated with that ID. For our custom tests, the test ID will be the `namespace` specified when registering the provider, followed by the path to the test file relative to the tests folder. For example, the Confusion Matrix test we created earlier will have the test ID `my_test_provider.ConfusionMatrix`. You could organize the tests in subfolders, say `classification` and `regression`, and the test ID for the Confusion Matrix test would then be `my_test_provider.classification.ConfusionMatrix`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"my_test_provider.ScoreBandDiscriminationMetrics\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " }\n", + ").log(section_id=\"interpretability_insights\")" ] }, { diff --git a/site/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py b/site/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py new file mode 100644 index 0000000000..62127d82ec --- /dev/null +++ b/site/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py @@ -0,0 +1,193 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from sklearn.metrics import roc_curve, roc_auc_score +from typing import Tuple +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +@tags("visualization", "credit_risk", "scorecard") +@tasks("classification") +def ScoreBandDiscriminationMetrics( + dataset: VMDataset, + model: VMModel, + score_column: str = "score", + score_bands: list = None, + title: str = "Score Band Discrimination Metrics", +) -> Tuple[go.Figure, pd.DataFrame]: + """ + Evaluates discrimination metrics (AUC, GINI, KS) across different score bands for credit risk assessment. + + ### Purpose + + The Score Band Discrimination Metrics test is designed to evaluate the model's discriminatory power across + different score ranges. 
By segmenting the score distribution into bands and calculating key discrimination + metrics within each band, this test helps identify whether the model maintains consistent performance across + the entire score spectrum. This is crucial for understanding if the model's ability to separate good and bad + accounts varies significantly across different score ranges. + + ### Test Mechanism + + This test proceeds by first segmenting the score distribution into predefined bands. For each band, it + calculates three key discrimination metrics: AUC (Area Under the Curve), GINI coefficient, and KS + (Kolmogorov-Smirnov) statistic. The AUC measures the model's ability to rank order risk, the GINI + coefficient provides a measure of inequality in the predictions, and the KS statistic quantifies the maximum + separation between cumulative distributions. The test also tracks the population distribution and default + rates across bands to provide context for the discrimination metrics. + + ### Signs of High Risk + + - Significant variations in discrimination metrics between adjacent score bands + - Very low metric values in specific score ranges, indicating poor discrimination + - Inconsistent patterns in metric values across the score spectrum + - Large disparities between band-specific metrics and overall metrics + - Unexpected relationships between default rates and discrimination metrics + - Insufficient population in certain score bands for reliable metric calculation + + ### Strengths + + - Provides a comprehensive view of model discrimination across the score spectrum + - Combines multiple complementary metrics for robust performance assessment + - Identifies specific score ranges where model performance might be suboptimal + - Includes population and default rate context for better interpretation + - Handles edge cases such as single-class bands and insufficient data + - Enables visual comparison of metrics across score bands + + ### Limitations + + - Requires sufficient data in each score band for reliable metric calculation + - May be sensitive to the choice of score band boundaries + - Does not account for business importance of different score ranges + - Metrics may be unstable in bands with very low default rates + - Cannot directly suggest optimal score band boundaries + - Limited to assessing discrimination aspects of model performance + """ + if score_column not in dataset.df.columns: + raise ValueError(f"Score column '{score_column}' not found in dataset") + + df = dataset.df.copy() + + # Default score bands if none provided + if score_bands is None: + score_bands = [410, 440, 470] + + # Create band labels + band_labels = [ + f"{score_bands[i]}-{score_bands[i+1]}" for i in range(len(score_bands) - 1) + ] + band_labels.insert(0, f"<{score_bands[0]}") + band_labels.append(f">{score_bands[-1]}") + + # Bin the scores + df["score_band"] = pd.cut( + df[score_column], bins=[-np.inf] + score_bands + [np.inf], labels=band_labels + ) + + # Calculate metrics for each band + results = [] + for band in band_labels: + band_mask = df["score_band"] == band + if band_mask.sum() > 1: # Need at least 2 samples + y_true = df[band_mask][dataset.target_column].values + y_prob = dataset.y_prob(model)[ + band_mask + ] # Get predicted probabilities using dataset method + + # Convert to float arrays + y_true = np.array(y_true, dtype=float) + y_prob = np.array(y_prob, dtype=float) + + # Calculate metrics + try: + fpr, tpr, _ = roc_curve(y_true, y_prob) + ks = max(tpr - fpr) + auc = roc_auc_score(y_true, y_prob) 
+ gini = 2 * auc - 1 + except ValueError: # Handle cases with single class + ks, auc, gini = 0, 0.5, 0 + + results.append( + { + "Score Band": band, + "Population Count": band_mask.sum(), + "Population (%)": (band_mask.sum() / len(df)) * 100, + "AUC": auc, + "GINI": gini, + "KS": ks, + "Default Rate (%)": (y_true.mean() * 100), + } + ) + + # Calculate total metrics + y_true = df[dataset.target_column].values + y_prob = dataset.y_prob(model) # Get predicted probabilities for total calculation + + fpr, tpr, _ = roc_curve(y_true, y_prob) + total_ks = max(tpr - fpr) + total_auc = roc_auc_score(y_true, y_prob) + total_gini = 2 * total_auc - 1 + + # Add total row + results.append( + { + "Score Band": f"Total ({df[score_column].min():.0f}-{df[score_column].max():.0f})", + "Population Count": len(df), + "Population (%)": 100.0, + "AUC": total_auc, + "GINI": total_gini, + "KS": total_ks, + "Default Rate (%)": (y_true.mean() * 100), + } + ) + + results_df = pd.DataFrame(results) + + # Create visualization (excluding total) + fig = go.Figure() + + # Filter out the total row for plotting + plot_df = results_df[results_df["Score Band"].str.contains("Total") == False] + + # Add metric bars + for metric, color in [ + ("AUC", "rgb(31, 119, 180)"), + ("GINI", "rgb(255, 127, 14)"), + ("KS", "rgb(44, 160, 44)"), + ]: + fig.add_trace( + go.Bar( + name=metric, + x=plot_df["Score Band"], + y=plot_df[metric], + marker_color=color, + ) + ) + + # Add default rate line (excluding total) + fig.add_trace( + go.Scatter( + name="Default Rate (%)", + x=plot_df["Score Band"], + y=plot_df["Default Rate (%)"], + yaxis="y2", + line=dict(color="red", width=2), + ) + ) + + # Update layout + fig.update_layout( + title=title, + xaxis_title="Score Band", + yaxis_title="Discrimination Metrics", + yaxis2=dict(title="Default Rate (%)", overlaying="y", side="right"), + barmode="group", + showlegend=True, + height=600, + ) + + return fig, results_df diff --git a/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb new file mode 100644 index 0000000000..ab5d6d4bf6 --- /dev/null +++ b/site/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -0,0 +1,1215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ongoing Monitoring for Application Scorecard \n", + "\n", + "In this notebook, you'll learn how to seamlessly monitor your production models using the ValidMind Platform.\n", + "\n", + "We'll walk you through the process of initializing the ValidMind Library, loading a sample dataset and model, and running a monitoring test suite to quickly generate documentation about your new data and model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation, validation, monitoring tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you explore the available resources for developers at some point. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "\n", + "Signing up is FREE — Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Model monitoring documentation**: A comprehensive and structured record of a production model, including key elements such as data sources, inputs, performance metrics, and periodic evaluations. This documentation ensures transparency and visibility of the model's performance in the production environment.\n", + "\n", + "**Monitoring documentation template**: Similar to documentation template, The monitoring documentation template functions as a test suite and lays out the structure of model monitoring documentation, segmented into various sections and sub-sections. Monitoring documentation templates define the structure of your model monitoring documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Binary classification`\n", + " - Use case: `Marketing/Sales - Attrition/Churn Management`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\",\n", + " monitoring = True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize the Python environment\n", + "\n", + "Next, let's import the necessary libraries and set up your Python environment for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "import numpy as np\n", + "\n", + "from datetime import datetime, timedelta\n", + "\n", + "from validmind.tests import run_test\n", + "from validmind.datasets.credit_risk import lending_club\n", + "from validmind.unit_metrics import list_metrics\n", + "from validmind.unit_metrics import describe_metric\n", + "from validmind.unit_metrics import run_metric\n", + "from validmind.api_client import log_metric\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preview the monitoring template\n", + "\n", + "A template predefines sections for your model monitoring documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "You will upload documentation and test results into this template later on. 
For now, take a look at the structure that the template provides with the `vm.preview_template()` function from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the reference and monitoring datasets\n", + "\n", + "The sample dataset used here is provided by the ValidMind library. For demonstration purposes we'll use the training, test dataset splits as `reference` and `monitoring` datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = lending_club.load_data(source=\"offline\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocess_df = lending_club.preprocess(df)\n", + "preprocess_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fe_df = lending_club.feature_engineering(preprocess_df)\n", + "fe_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the model\n", + "\n", + "In this section, we focus on constructing and refining our predictive model. \n", + "- We begin by dividing our data, which is based on Weight of Evidence (WoE) features, into training and testing sets (`train_df`, `test_df`). \n", + "- With `lending_club.split`, we employ a simple random split, randomly allocating data points to each set to ensure a mix of examples in both." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Split the data\n", + "train_df, test_df = lending_club.split(fe_df, test_size=0.2)\n", + "\n", + "x_train = train_df.drop(lending_club.target_column, axis=1)\n", + "y_train = train_df[lending_club.target_column]\n", + "\n", + "x_test = test_df.drop(lending_club.target_column, axis=1)\n", + "y_test = test_df[lending_club.target_column]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the XGBoost model\n", + "xgb_model = xgb.XGBClassifier(\n", + " n_estimators=50, \n", + " random_state=42, \n", + " early_stopping_rounds=10\n", + ")\n", + "xgb_model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "\n", + "# Fit the model\n", + "xgb_model.fit(\n", + " x_train, \n", + " y_train,\n", + " eval_set=[(x_test, y_test)],\n", + " verbose=False\n", + ")\n", + "\n", + "# Compute probabilities\n", + "train_xgb_prob = xgb_model.predict_proba(x_train)[:, 1]\n", + "test_xgb_prob = xgb_model.predict_proba(x_test)[:, 1]\n", + "\n", + "# Compute binary predictions\n", + "cut_off_threshold = 0.3\n", + "train_xgb_binary_predictions = (train_xgb_prob > cut_off_threshold).astype(int)\n", + "test_xgb_binary_predictions = (test_xgb_prob > cut_off_threshold).astype(int)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize the ValidMind datasets\n", + "\n", + "Before you can run tests, you must first initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module.\n", + "\n", + "This function takes a number of arguments:\n", + "\n", + "- `dataset` — The raw dataset that you want to provide as input to tests.\n", + "- 
`input_id` - A unique identifier that allows tracking what inputs are used when running each individual test.\n", + "- `target_column` — A required argument if tests require access to true values. This is the name of the target column in the dataset.\n", + "\n", + "With all datasets ready, you can now initialize training, reference(test) and monitor datasets (`reference_df` and `monitor_df`) created earlier into their own dataset objects using [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "vm_reference_ds = vm.init_dataset(\n", + " dataset=train_df,\n", + " input_id=\"reference_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")\n", + "\n", + "vm_monitoring_ds = vm.init_dataset(\n", + " dataset=test_df,\n", + " input_id=\"monitoring_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize a model object\n", + "\n", + "You will also need to initialize a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data. You simply intialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model):" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "vm_xgb_model = vm.init_model(\n", + " xgb_model,\n", + " input_id=\"xgb_model\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Assign prediction values and probabilities to the datasets\n", + "\n", + "With our model now trained, we'll move on to assigning both the predictive probabilities coming directly from the model's predictions, and the binary prediction after applying the cutoff threshold described in the previous steps. \n", + "- These tasks are achieved through the use of the `assign_predictions()` method associated with the VM `dataset` object.\n", + "- This method links the model's class prediction values and probabilities to our VM train and test datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "vm_reference_ds.assign_predictions(\n", + " model=vm_xgb_model,\n", + " prediction_values=train_xgb_binary_predictions,\n", + " prediction_probabilities=train_xgb_prob,\n", + ")\n", + "\n", + "vm_monitoring_ds.assign_predictions(\n", + " model=vm_xgb_model,\n", + " prediction_values=test_xgb_binary_predictions,\n", + " prediction_probabilities=test_xgb_prob,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compute credit risk scores\n", + "\n", + "In this phase, we translate model predictions into actionable scores using probability estimates generated by our trained model." 
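As a rough orientation, application scorecards typically map the model's probability of default onto points using a log-odds scale. The sketch below shows that standard scaling; the target score, target odds, and PDO values are illustrative assumptions, and the exact parameters used internally by `lending_club.compute_scores()` in the next cell may differ.

```python
# Illustrative log-odds (points-to-double-the-odds) scaling; parameter values are
# assumptions for demonstration and not necessarily what compute_scores() uses.
import numpy as np


def prob_to_score(pd_, target_score=600, target_odds=50, pdo=20):
    """Map a probability of default to a scorecard score via log-odds scaling."""
    factor = pdo / np.log(2)
    offset = target_score - factor * np.log(target_odds)
    good_bad_odds = (1 - pd_) / pd_  # odds of being a "good" account
    return offset + factor * np.log(good_bad_odds)


# Higher default probability gives a lower score, e.g.:
# prob_to_score(np.array([0.02, 0.10, 0.30]))
```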
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "train_xgb_scores = lending_club.compute_scores(train_xgb_prob)\n", + "test_xgb_scores = lending_club.compute_scores(test_xgb_prob)\n", + "\n", + "# Assign scores to the datasets\n", + "vm_reference_ds.add_extra_column(\"xgb_scores\", train_xgb_scores)\n", + "vm_monitoring_ds.add_extra_column(\"xgb_scores\", test_xgb_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adding custom context to the LLM descriptions\n", + "\n", + "To enable the LLM descriptions context, you need to set the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `1`. This will enable the LLM descriptions context, which will be used to provide additional context to the LLM descriptions. This is a global setting that will affect all tests." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED\"] = \"1\"\n", + "\n", + "context = \"\"\"\n", + "FORMAT FOR THE LLM DESCRIPTIONS: \n", + " **** is designed to .\n", + "\n", + " The test operates by \n", + "\n", + " The primary advantages of this test include \n", + "\n", + " Users should be aware that \n", + "\n", + " **Key Insights:**\n", + "\n", + " The test results reveal:\n", + "\n", + " - ****: \n", + " - ****: \n", + " ...\n", + "\n", + " Based on these results, \n", + "\n", + "ADDITIONAL INSTRUCTIONS:\n", + " Present insights in order from general to specific, with each insight as a single bullet point with bold title.\n", + "\n", + " For each metric in the test results, include in the test overview:\n", + " - The metric's purpose and what it measures\n", + " - Its mathematical formula\n", + " - The range of possible values\n", + " - What constitutes good/bad performance\n", + " - How to interpret different values\n", + "\n", + " Each insight should progressively cover:\n", + " 1. Overall scope and distribution\n", + " 2. Complete breakdown of all elements with specific values\n", + " 3. Natural groupings and patterns\n", + " 4. Comparative analysis between datasets/categories\n", + " 5. Stability and variations\n", + " 6. Notable relationships or dependencies\n", + "\n", + " Remember:\n", + " - Keep all insights at the same level (no sub-bullets or nested structures)\n", + " - Make each insight complete and self-contained\n", + " - Include specific numerical values and ranges\n", + " - Cover all elements in the results comprehensively\n", + " - Maintain clear, concise language\n", + " - Use only \"- **Title**: Description\" format for insights\n", + " - Progress naturally from general to specific observations\n", + "\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitoring data description\n", + "\n", + "The Monitoring Data Description tests aim to provide a comprehensive statistical analysis of the monitoring dataset's characteristics. 
These tests examine the basic statistical properties, identify any missing data patterns, assess data uniqueness, visualize numerical feature distributions, and evaluate feature relationships through correlation analysis.\n", + "\n", + "The primary objective is to establish a baseline understanding of the monitoring data's structure and quality, enabling the detection of any significant deviations from expected patterns that could impact model performance. Each test is designed to capture different aspects of the data, from univariate statistics to multivariate relationships, providing a foundation for ongoing data quality assessment in the production environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.MissingValues:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.UniqueRows:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.PearsonCorrelationMatrix:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.HighPearsonCorrelation:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"feature_columns\": vm_monitoring_ds.feature_columns,\n", + " \"max_threshold\": 0.5,\n", + " \"top_n_correlations\": 10\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassImbalanceDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 1\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Target and feature drift\n", + "\n", + "Next, the goal is to investigate the distributional characteristics of predictions and features to determine if the underlying data has changed. These tests are crucial for assessing the expected accuracy of the model.\n", + "\n", + "1. **Target drift:** We compare the dataset used for testing (reference data) with the monitoring data. This helps to identify any shifts in the target variable distribution.\n", + "2. 
**Feature drift:** We compare the training dataset with the monitoring data. Since features were used to train the model, any drift in these features could indicate potential issues, as the underlying patterns that the model was trained on may have changed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we can examine the correlation between features and predictions. Significant changes in these correlations may trigger a deeper assessment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.TargetPredictionDistributionPlot\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we want see difference in correlation pairs between model prediction and features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.PredictionCorrelation\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally for target drift, let's plot each prediction value and feature grid side by side." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.PredictionQuantilesAcrossFeatures\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's add run a test to investigate how or if the features have drifted. In this instance we want to compare the training data with prediction data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.FeatureDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"psi_threshold\": 0.2,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classification accuracy\n", + "\n", + "We now evaluate the model's predictive performance by comparing its behavior between reference and monitoring datasets. These tests analyze shifts in overall accuracy metrics, examine changes in the confusion matrix to identify specific classification pattern changes, and assess the model's probability calibration across different prediction thresholds. \n", + "\n", + "The primary objective is to detect any degradation in the model's classification performance that might indicate reliability issues in production. 
The tests provide both aggregate performance metrics and detailed breakdowns of prediction patterns, enabling the identification of specific areas where the model's accuracy might be deteriorating." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassificationAccuracyDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ConfusionMatrixDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.CalibrationCurveDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"n_bins\": 10,\n", + " \"drift_pct_threshold\": 10,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Class discrimination\n", + "\n", + "The following tests assess the model's ability to effectively separate different classes in both reference and monitoring datasets. These tests analyze the model's discriminative power by examining the separation between class distributions, evaluating changes in the ROC curve characteristics, comparing probability distribution patterns, and assessing cumulative prediction trends. \n", + "\n", + "The primary objective is to identify any deterioration in the model's ability to distinguish between classes, which could indicate a decline in model effectiveness. The tests examine both the overall discriminative capability and the granular patterns in prediction distributions, providing insights into whether the model maintains its ability to effectively differentiate between classes in the production environment." 
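For reference, the discrimination measures these drift tests track can also be computed directly with scikit-learn. The snippet below is a sanity-check sketch only, reusing objects defined earlier in this notebook; the ValidMind tests compute their own statistics internally.

```python
# Reference-only computation of AUC, GINI and KS on the monitoring data,
# reusing the labels and probabilities defined earlier in this notebook.
from sklearn.metrics import roc_auc_score, roc_curve

y_true = test_df[lending_club.target_column]
y_prob = test_xgb_prob

auc = roc_auc_score(y_true, y_prob)
gini = 2 * auc - 1                    # GINI is a linear rescaling of AUC
fpr, tpr, _ = roc_curve(y_true, y_prob)
ks = max(tpr - fpr)                   # maximum separation of the cumulative distributions

print(f"AUC={auc:.3f}  GINI={gini:.3f}  KS={ks:.3f}")
```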
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassDiscriminationDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ROCCurveDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.PredictionProbabilitiesHistogramDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 10,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.CumulativePredictionProbabilitiesDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scoring\n", + "\n", + "Next we analyze the distribution and stability of credit scores across reference and monitoring datasets. These tests evaluate shifts in score distributions, examine changes in score band populations, and assess the relationship between scores and default rates. \n", + "\n", + "The primary objective is to identify any significant changes in how the model assigns credit scores, which could indicate drift in risk assessment capabilities. The tests examine both the overall score distribution patterns and the specific performance within defined score bands, providing insights into whether the model maintains consistent and reliable risk segmentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ScorecardHistogramDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"drift_pct_threshold\": 20,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.ongoing_monitoring.ScoreBandsDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " \"drift_pct_threshold\": 20,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model insights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", + " input_grid={\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": [vm_xgb_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.FeaturesAUC\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"kernel_explainer_samples\": 10,\n", + " \"tree_or_linear_explainer_samples\": 200,\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Diagnostic monitoring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"cut_off_threshold\": 0.04\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Robustness monitoring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"scaling_factor_std_dev_list\": [\n", + " 0.1,\n", + " 0.2,\n", + " 0.3,\n", + " 0.4,\n", + " 0.5\n", + " ],\n", + " \"performance_decay_threshold\": 0.05\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Performance history\n", + "\n", + "In this section we showcase how to track and visualize the temporal evolution of key model performance metrics, including AUC, F1 score, precision, recall, and accuracy. For demonstration purposes, the section simulates historical performance data by introducing a gradual downward trend and random noise to these metrics over a specified time period. These tests are useful for analyzing the stability and trends in model performance indicators, helping to identify potential degradation or unexpected fluctuations in model behavior over time. \n", + "\n", + "The main goal is to maintain a continuous record of model performance that can be used to detect gradual drift, sudden changes, or cyclical patterns in model effectiveness. This temporal monitoring approach provides early warning signals of potential issues and helps establish whether the model maintains consistent performance within acceptable boundaries throughout its deployment period." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = [metric for metric in list_metrics() if \"classification\" in metric]\n", + "\n", + "for metric_id in metrics:\n", + " describe_metric(metric_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.ROC_AUC\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "auc = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Accuracy\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "accuracy = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Recall\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "recall = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f1 = run_metric(\n", + " \"validmind.unit_metrics.classification.F1\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "f1 = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "precision = run_metric(\n", + " \"validmind.unit_metrics.classification.Precision\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "precision = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_DAYS = 10\n", + "REFERENCE_DATE = datetime(2024, 1, 1) # Fixed date: January 1st, 2024\n", + "base_date = REFERENCE_DATE - timedelta(days=NUM_DAYS)\n", + "\n", + "\n", + "# Initial values\n", + "performance_metrics = {\n", + " \"AUC Score\": auc,\n", + " \"F1 Score\": f1,\n", + " \"Precision Score\": precision,\n", + " \"Recall Score\": recall,\n", + " \"Accuracy Score\": accuracy\n", + "}\n", + "\n", + "# Trend parameters\n", + "trend_factor = 0.98 
# Slight downward trend (multiply by 0.98 each step)\n", + "noise_scale = 0.02 # Random fluctuation of ±2%\n", + "\n", + "\n", + "for i in range(NUM_DAYS):\n", + " recorded_at = base_date + timedelta(days=i)\n", + " print(f\"\\nrecorded_at: {recorded_at}\")\n", + "\n", + " # Log each metric with trend and noise\n", + " for metric_name, base_value in performance_metrics.items():\n", + " # Apply trend and add random noise\n", + " trend = base_value * (trend_factor ** i)\n", + " noise = np.random.normal(0, noise_scale * base_value)\n", + " value = max(0, min(1, trend + noise)) # Ensure value stays between 0 and 1\n", + " \n", + " log_metric(\n", + " key=metric_name,\n", + " value=value,\n", + " recorded_at=recorded_at.isoformat()\n", + " )\n", + " \n", + " print(f\"{metric_name:<15}: {value:.4f}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-eEL8LtKG-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/how_to/add_context_to_llm_descriptions.ipynb b/site/notebooks/how_to/add_context_to_llm_descriptions.ipynb new file mode 100644 index 0000000000..6951e52858 --- /dev/null +++ b/site/notebooks/how_to/add_context_to_llm_descriptions.ipynb @@ -0,0 +1,1168 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Add context to LLM-generated test descriptions\n", + "\n", + "When you run ValidMind tests, test descriptions are automatically generated with LLM using the test results, the test name, and the static test definitions provided in the test's docstring. While this metadata offers valuable high-level overviews of tests, insights produced by the LLM-based descriptions may not always align with your specific use cases or incorporate organizational policy requirements.\n", + "\n", + "In this notebook, you'll learn how to add context to the generated descriptions by providing additional information about the test or the use case. Including custom use case context is useful when you want to highlight information about the intended use and technique of the model, or the insitution policies and standards specific to your use case." 
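As a quick preview of the mechanism this notebook walks through, the sketch below shows the two environment variables that ValidMind reads when generating test descriptions; both are demonstrated step by step later on, and the example context string here is only a hypothetical placeholder.

```python
import os

# Globally enable ("1") or disable ("0") custom context for LLM-generated descriptions
os.environ["VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED"] = "1"

# Free-form text with use case and/or test-specific acceptance criteria
# (hypothetical placeholder; the notebook builds richer context strings below)
os.environ["VALIDMIND_LLM_DESCRIPTIONS_CONTEXT"] = (
    "Customer churn model used in a loan approval workflow; "
    "flag results that conflict with model risk policy."
)
```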
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [Install the ValidMind Library](#toc1_) \n", + "- [Initialize the ValidMind Library](#toc2_) \n", + " - [Get your code snippet](#toc2_1_) \n", + "- [Initialize the Python environment](#toc3_) \n", + "- [Load the sample dataset](#toc4_) \n", + " - [Preprocess the raw dataset](#toc4_1_) \n", + "- [Initialize the ValidMind objects](#toc5_) \n", + " - [Initialize the datasets](#toc5_1_) \n", + " - [Initialize a model object](#toc5_2_) \n", + " - [Assign predictions to the datasets](#toc5_3_) \n", + "- [Set custom context for test descriptions](#toc6_) \n", + " - [Review default LLM-generated descriptions](#toc6_1_) \n", + " - [Enable use case context](#toc6_2_) \n", + " - [Disable use case context](#toc6_2_1_) \n", + " - [Add test-specific context](#toc6_3_) \n", + " - [Dataset Description](#toc6_3_1_) \n", + " - [Class Imbalance](#toc6_3_2_) \n", + " - [High Cardinality](#toc6_3_3_) \n", + " - [Missing Values](#toc6_3_4_) \n", + " - [Unique Rows](#toc6_3_5_) \n", + " - [Too Many Zero Values](#toc6_3_6_) \n", + " - [IQR Outliers Table](#toc6_3_7_) \n", + " - [Descriptive Statistics](#toc6_3_8_) \n", + " - [Pearson Correlation Matrix](#toc6_3_9_) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Binary classification`\n", + " - Use case: `Marketing/Sales - Attrition/Churn Management`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. 
Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " # api_key = \"...\",\n", + " # api_secret = \"...\",\n", + " # model = \"...\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initialize the Python environment\n", + "\n", + "After you've connected to your model register in the ValidMind Platform, let's import the necessary libraries and set up your Python environment for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "import os\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Load the sample dataset\n", + "\n", + "First, we'll import a sample ValidMind dataset and load it into a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), a two-dimensional tabular data structure that makes use of rows and columns:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the sample dataset from the library\n", + "\n", + "from validmind.datasets.classification import customer_churn\n", + "\n", + "print(\n", + " f\"Loaded demo dataset with: \\n\\n\\t• Target column: '{customer_churn.target_column}' \\n\\t• Class labels: {customer_churn.class_labels}\"\n", + ")\n", + "\n", + "raw_df = customer_churn.load_data()\n", + "raw_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Preprocess the raw dataset\n", + "\n", + "Then, we'll perform a number of operations to get ready for the subsequent steps:\n", + "\n", + "- **Preprocess the data:** Splits the DataFrame (`df`) into multiple datasets (`train_df`, `validation_df`, and `test_df`) using `demo_dataset.preprocess` to simplify preprocessing.\n", + "- **Separate features and targets:** Drops the target column to create feature sets (`x_train`, `x_val`) and target sets (`y_train`, `y_val`).\n", + "- **Initialize XGBoost classifier:** Creates an `XGBClassifier` object with early stopping rounds set to 10.\n", + "- **Set evaluation metrics:** Specifies metrics for model evaluation as `error`, `logloss`, and `auc`.\n", + "- **Fit the model:** Trains the model on `x_train` and `y_train` using the validation set `(x_val, y_val)`. Verbose output is disabled." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df, validation_df, test_df = customer_churn.preprocess(raw_df)\n", + "\n", + "x_train = train_df.drop(customer_churn.target_column, axis=1)\n", + "y_train = train_df[customer_churn.target_column]\n", + "x_val = validation_df.drop(customer_churn.target_column, axis=1)\n", + "y_val = validation_df[customer_churn.target_column]\n", + "\n", + "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", + "model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "model.fit(\n", + " x_train,\n", + " y_train,\n", + " eval_set=[(x_val, y_val)],\n", + " verbose=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initialize the ValidMind objects" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the datasets\n", + "\n", + "Before you can run tests, you'll need to initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module.\n", + "\n", + "We'll include the following arguments:\n", + "\n", + "- **`dataset`** — the raw dataset that you want to provide as input to tests\n", + "- **`input_id`** - a unique identifier that allows tracking what inputs are used when running each individual test\n", + "- **`target_column`** — a required argument if tests require access to true values. This is the name of the target column in the dataset\n", + "- **`class_labels`** — an optional value to map predicted classes to class labels\n", + "\n", + "With all datasets ready, you can now initialize the raw, training, and test datasets (`raw_df`, `train_df` and `test_df`) created earlier into their own dataset objects using [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=raw_df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=customer_churn.target_column,\n", + " class_labels=customer_churn.class_labels,\n", + ")\n", + "\n", + "vm_train_ds = vm.init_dataset(\n", + " dataset=train_df,\n", + " input_id=\"train_dataset\",\n", + " target_column=customer_churn.target_column,\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " dataset=test_df, input_id=\"test_dataset\", target_column=customer_churn.target_column\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize a model object\n", + "\n", + "Additionally, you'll need to initialize a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data. 
\n", + "\n", + "Simply intialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_model = vm.init_model(\n", + " model,\n", + " input_id=\"model\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign predictions to the datasets\n", + "\n", + "We can now use the `assign_predictions()` method from the Dataset object to link existing predictions to any model.\n", + "\n", + "If no prediction values are passed, the method will compute predictions automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds.assign_predictions(\n", + " model=vm_model,\n", + ")\n", + "\n", + "vm_test_ds.assign_predictions(\n", + " model=vm_model,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Set custom context for test descriptions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Review default LLM-generated descriptions\n", + "\n", + "By default, custom context for LLM-generated descriptions is disabled, meaning that the output will not include any additional context.\n", + "\n", + "Let's generate an initial test description for the `DatasetDescription` test for comparision with later iterations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.DatasetDescription\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Enable use case context\n", + "\n", + "To enable custom use case context, set the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `1`.\n", + "\n", + "This is a global setting that will affect all tests for your linked model:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED\"] = \"1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Enabling use case context allows you to pass in additional context, such as information about your model, relevant regulatory requirements, or model validation targets to the LLM-generated text descriptions within `use_case_context`:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "use_case_context = \"\"\"\n", + "\n", + "This is a customer churn prediction model for a banking loan application system using XGBoost classifier. 
\n", + "\n", + "Key Model Information:\n", + "- Use Case: Predict customer churn risk during loan application process\n", + "- Model Type: Binary classification using XGBoost\n", + "- Critical Decision Point: Used in loan approval workflow\n", + "\n", + "Regulatory Requirements:\n", + "- Subject to model risk management review and validation\n", + "- Results require validation review for regulatory compliance\n", + "- Model decisions directly impact loan approval process\n", + "- Does this result raise any regulatory concerns?\n", + "\n", + "Validation Focus:\n", + "- Explain strengths and weaknesses of the test and the context of whether the result is acceptable.\n", + "- What does the result indicate about model reliability?\n", + "- Is the result within acceptable thresholds for loan decisioning?\n", + "- What are the implications for customer impact?\n", + "\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = use_case_context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the use case context set, generate an updated test description for the `DatasetDescription` test for comparision with default output:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.DatasetDescription\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Disable use case context\n", + "\n", + "To disable custom use case context, set the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `0`.\n", + "\n", + "This is a global setting that will affect all tests for your linked model:" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED\"] = \"0\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the use case context disabled again, generate another test description for the `DatasetDescription` test for comparision with previous custom output:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.DatasetDescription\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Add test-specific context\n", + "\n", + "In addition to the model-level `use_case_context`, you're able to add test-specific context to your LLM-generated descriptions allowing you to provide test-specific validation criteria about the test that is being run.\n", + "\n", + "We'll reenable use case context by setting the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `1`, then join the test-specific context to the use case context using the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT` environment variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED\"] = \"1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Dataset Description\n", + "\n", + "Rather than relying on generic dataset result descriptions in isolation, we'll use the context to specify precise thresholds for missing values, appropriate data types for banking variables (like `CreditScore` and `Balance`), and valid value ranges based on particular business rules:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "\n", + "Acceptance Criteria:\n", + "- Missing Values: All critical features must have less than 5% missing values (including CreditScore, Balance, Age)\n", + "- Data Types: All columns must have appropriate data types (numeric for CreditScore/Balance/Age, categorical for Geography/Gender)\n", + "- Cardinality: Categorical variables must have fewer than 50 unique values, while continuous variables should show appropriate distinct value counts (e.g., high for EstimatedSalary, exactly 2 for Boolean fields)\n", + "- Value Ranges: Numeric fields must fall within business-valid ranges (CreditScore: 300-850, Age: ≥18, Balance: ≥0)\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate an updated test description for the `DatasetDescription` test again:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.DatasetDescription\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Class Imbalance\n", + "\n", + "The following test-specific context example adds value to the LLM-generated description by providing defined risk levels to assess class representation:\n", + "\n", + "- By categorizing classes into `Low`, `Medium`, and `High Risk`, the LLM can generate more nuanced and actionable insights, ensuring that the analysis aligns with business requirements for balanced datasets.\n", + "- This approach not only highlights potential issues but also guides necessary documentation and mitigation strategies for high-risk classes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "\n", + "Acceptance Criteria:\n", + "\n", + "• Risk Levels for Class Representation:\n", + " - Low Risk: Each class represents 20% or more of the total dataset\n", + " - Medium Risk: Each class represents between 10% and 19.9% of the total dataset\n", + " - High Risk: Any class represents less than 10% of the total dataset\n", + "\n", + "• Overall Requirement:\n", + " - All classes must achieve at least Medium Risk status to pass\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate a test description for the `ClassImbalance` test for review:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.ClassImbalance\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 10,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### High Cardinality\n", + "\n", + "In the case below, the context specifies risk-based criteria for the number of distinct values in categorical features.\n", + "\n", + "This helps the LLM generate more nuanced and actionable insights, ensuring the descriptions are more relevant to your organization's policies." + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "\n", + "Acceptance Criteria:\n", + "\n", + "• Risk Levels for Distinct Values in Categorical Features:\n", + " - Low Risk: Each categorical column has fewer than 50 distinct values or less than 5% unique values relative to the total dataset size\n", + " - Medium Risk: Each categorical column has between 50 and 100 distinct values or between 5% and 10% unique values\n", + " - High Risk: Any categorical column has more than 100 distinct values or more than 10% unique values\n", + "\n", + "• Overall Requirement:\n", + " - All categorical columns must achieve at least Medium Risk status to pass\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate a test description for the `HighCardinality` test for review:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.HighCardinality\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params= {\n", + " \"num_threshold\": 100,\n", + " \"percent_threshold\": 0.1,\n", + " \"threshold_type\": \"percent\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Missing Values\n", + "\n", + "Here, we use the test-specific context to establish differentiated risk thresholds across features.\n", + "\n", + "Rather than applying uniform criteria, the context allows for specific 
requirements for critical financial features (`CreditScore`, `Balance`, `Age`)." + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "Test-Specific Context for Missing Values Analysis:\n", + "\n", + "Acceptance Criteria:\n", + "\n", + "• Risk Levels for Missing Values:\n", + " - Low Risk: Less than 1% missing values in any column\n", + " - Medium Risk: Between 1% and 5% missing values\n", + " - High Risk: More than 5% missing values\n", + "\n", + "• Feature-Specific Requirements:\n", + " - Critical Features (CreditScore, Balance, Age):\n", + " * Must maintain Low Risk status\n", + " * No missing values allowed\n", + " \n", + " - Secondary Features (Tenure, NumOfProducts, EstimatedSalary):\n", + " * Must achieve at least Medium Risk status\n", + " * Up to 3% missing values acceptable\n", + "\n", + " - Categorical Features (Geography, Gender):\n", + " * Must achieve at least Medium Risk status\n", + " * Up to 5% missing values acceptable\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate a test description for the `MissingValues` test for review:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.MissingValues\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params= {\n", + " \"min_threshold\": 1\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Unique Rows\n", + "\n", + "This example context establishes variable-specific thresholds based on business expectations.\n", + "\n", + "Rather than applying uniform criteria, it recognizes that high variability is expected in features like `EstimatedSalary` (>90%) and `Balance` (>50%), while enforcing strict limits on categorical features like `Geography` (<5 values), ensuring meaningful validation aligned with banking data characteristics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "\n", + "Acceptance Criteria:\n", + "\n", + "• High-Variability Expected Features:\n", + " - EstimatedSalary: Must have >90% unique values\n", + " - Balance: Must have >50% unique values\n", + " - CreditScore: Must have between 5-10% unique values\n", + "\n", + "• Medium-Variability Features:\n", + " - Age: Should have between 0.5-2% unique values\n", + " - Tenure: Should have between 0.1-0.5% unique values\n", + "\n", + "• Low-Variability Features:\n", + " - Binary Features (HasCrCard, IsActiveMember, Gender, Exited): Must have exactly 2 unique values\n", + " - Geography: Must have fewer than 5 unique values\n", + " - NumOfProducts: Must have fewer than 10 unique values\n", + "\n", + "• Overall Requirements:\n", + " - Features must fall within their specified ranges to pass\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate a test description for the `UniqueRows` test for review:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.UniqueRows\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params= {\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Too Many Zero Values\n", + "\n", + "Here, test-specific context is used to provide meaning and expectations for different variables:\n", + "\n", + "- For instance, zero values in `Balance` and `Tenure` indicate risk, whereas zeros in binary variables like `HasCrCard` or `IsActiveMember` are expected.\n", + "- This tailored context ensures that the analysis accurately reflects the business significance of zero values across different features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "\n", + "Acceptance Criteria:\n", + "- Numerical Features Only: Test evaluates only continuous numeric columns (Balance, Tenure), \n", + " excluding binary columns (HasCrCard, IsActiveMember)\n", + "\n", + "- Risk Level Thresholds for Balance and Tenure:\n", + " - High Risk: More than 5% zero values\n", + " - Medium Risk: Between 3% and 5% zero values\n", + " - Low Risk: Less than 3% zero values\n", + "\n", + "- Individual Column Requirements:\n", + " - Balance: Must be Low Risk (banking context requires accurate balance tracking)\n", + " - Tenure: Must be Low or Medium Risk (some zero values acceptable for new customers)\n", + "\n", + "• Overall Test Result: Test must achieve \"Pass\" status (Low Risk) for Balance, and at least Medium Risk for Tenure\n", + "\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate a test description for the `TooManyZeroValues` test for review:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.TooManyZeroValues\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params= {\n", + " \"max_percent_threshold\": 0.03\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### IQR Outliers Table\n", + "\n", + "In this case, we use test-specific context to incorporate risk levels tailored to key variables, like `CreditScore`, `Age`, and `NumOfProducts`, that otherwise would not be considered for outlier analysis if we ran the test without context where all variables would be evaluated without any business criteria." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "\n", + "Acceptance Criteria:\n", + "- Risk Levels for Outliers:\n", + " - Low Risk: 0-50 outliers\n", + " - Medium Risk: 51-300 outliers\n", + " - High Risk: More than 300 outliers\n", + "- Feature-Specific Requirements:\n", + " - CreditScore, Age, NumOfProducts: Must maintain Low Risk status to ensure data quality and model reliability\n", + "\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate a test description for the `IQROutliersTable` test for review:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.IQROutliersTable\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params= {\n", + " \"threshold\": 1.5\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Descriptive Statistics\n", + "\n", + "Test-specific context is used in this case to provide risk-based thresholds aligned with the bank's policy.\n", + "\n", + "For instance, `CreditScore` ranges of 550-850 are considered low risk based on standard credit assessment practices, while `Balance` thresholds reflect typical retail banking ranges." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "\n", + "Acceptance Criteria:\n", + "\n", + "• CreditScore:\n", + " - Low Risk: 550-850\n", + " - Medium Risk: 450-549\n", + " - High Risk: <450 or missing\n", + " - Justification: Banking standards require reliable credit assessment\n", + "\n", + "• Age:\n", + " - Low Risk: 18-75\n", + " - Medium Risk: 76-85\n", + " - High Risk: >85 or <18\n", + " - Justification: Core banking demographic with age-appropriate products\n", + "\n", + "• Balance:\n", + " - Low Risk: 0-200,000\n", + " - Medium Risk: 200,001-250,000\n", + " - High Risk: >250,000\n", + " - Justification: Typical retail banking balance ranges\n", + "\n", + "• Tenure:\n", + " - Low Risk: 1-10 years\n", + " - Medium Risk: <1 year\n", + " - High Risk: 0 or >10 years\n", + " - Justification: Expected customer relationship duration\n", + "\n", + "• EstimatedSalary:\n", + " - Low Risk: 25,000-150,000\n", + " - Medium Risk: 150,001-200,000\n", + " - High Risk: <25,000 or >200,000\n", + " - Justification: Typical income ranges for retail banking customers\n", + "\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate a test description for the `DescriptiveStatistics` test for review:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "\n", + "\n", + "#### Pearson Correlation Matrix\n", + "\n", + "For this test, the context provides meaningful correlation ranges between specific variable pairs based on business criteria.\n", + "\n", + "For example, while a general correlation analysis might flag any correlation above 0.7 as concerning, the test-specific context specifies that `Balance` and `NumOfProducts` should maintain a negative correlation between -0.4 and 0, reflecting expected banking relationships." + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "test_context = \"\"\"\n", + "\n", + "Acceptance Criteria:\n", + "\n", + "• Target Variable Correlations (Exited):\n", + " - Must show correlation coefficients between ±0.1 and ±0.3 with Age, CreditScore, and Balance\n", + " - Should not exceed ±0.2 correlation with other features\n", + " - Justification: Ensures predictive power while avoiding target leakage\n", + "\n", + "• Feature Correlations:\n", + " - Balance & NumOfProducts: Must maintain correlation between -0.4 and 0\n", + " - Age & Tenure: Should show positive correlation between 0.1 and 0.3\n", + " - CreditScore & Balance: Should maintain correlation between 0.1 and 0.3\n", + "\n", + "• Binary Feature Correlations:\n", + " - HasCreditCard & IsActiveMember: Must not exceed ±0.15 correlation\n", + " - Binary features should not show strong correlations (>±0.2) with continuous features\n", + "\n", + "• Overall Requirement:\n", + " - No feature pair should exceed ±0.7 correlation to avoid multicollinearity\n", + "\n", + "\"\"\".strip()\n", + "\n", + "context = f\"\"\"\n", + "{use_case_context}\n", + "\n", + "{test_context}\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the test-specific context set, generate a test description for the `PearsonCorrelationMatrix` test for review:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.tests.run_test(\n", + " \"validmind.data_validation.PearsonCorrelationMatrix\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-py3.10", + "language": "python", + "name": "validmind-py3.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/how_to/log_metrics_over_time.ipynb b/site/notebooks/how_to/log_metrics_over_time.ipynb new file mode 100644 index 0000000000..9cef4c5402 --- /dev/null +++ b/site/notebooks/how_to/log_metrics_over_time.ipynb @@ -0,0 +1,720 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Log metrics over time\n", + "\n", + "Learn how to track and visualize the temporal evolution of key model performance metrics with ValidMind.\n", + "\n", + "- Key model performance metrics such as AUC, F1 score, precision, recall, and accuracy, are useful for analyzing the stability and trends in model performance indicators, helping to identify potential degradation or unexpected fluctuations in model behavior over time.\n", + "- By monitoring these metrics systematically, teams can detect early 
warning signs of model drift and take proactive measures to maintain model reliability.\n", + "- Unit metrics in ValidMind provide a standardized way to compute and track individual performance measures, making it easy to monitor specific aspects of model behavior.\n", + "\n", + "Log metrics over time with the ValidMind Library's [`log_metric()`](https://docs.validmind.ai/validmind/validmind.html#log_metric) function and visualize them in your documentation using the *Metric Over Time* block within the ValidMind Platform. This integration enables seamless tracking of model performance, supporting custom thresholds and facilitating the automation of alerts based on logged metrics.\n", + "\n", + "
Metrics over time are most commonly associated with the continued monitoring of a model's performance once it is deployed.\n", + "

\n", + "While you are able to add Metric Over Time blocks to model documentation, we recommend first enabling ongoing monitoring for your model to maximize the potential of your performance data.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1_) \n", + " - [Before you begin](#toc1_1_) \n", + " - [New to ValidMind?](#toc1_2_) \n", + " - [Key concepts](#toc1_3_) \n", + "- [Install the ValidMind Library](#toc2_) \n", + "- [Initialize the ValidMind Library](#toc3_) \n", + " - [Get your code snippet](#toc3_1_) \n", + "- [Initialize the Python environment](#toc4_) \n", + "- [Load demo model](#toc5_) \n", + "- [Log metrics](#toc6_) \n", + " - [Run unit metrics](#toc6_1_) \n", + " - [Log unit metrics over time](#toc6_2_) \n", + " - [Pass thresholds](#toc6_3_) \n", + " - [Log multiple metrics with custom thresholds](#toc6_4_) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## About ValidMind\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "

\n", + "Signing up is FREE — Register with ValidMind
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: The [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Credit Risk Scorecard`\n", + " - Use case: `Credit Risk - CECL`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + " monitoring = True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Initialize the Python environment\n", + "\n", + "Next, let's import the necessary libraries and set up your Python environment for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "import numpy as np\n", + "\n", + "from datetime import datetime, timedelta\n", + "\n", + "from validmind.unit_metrics import list_metrics, describe_metric, run_metric\n", + "from validmind.api_client import log_metric\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Load demo model\n", + "\n", + "We'll use a classification model trained on customer churn data to demonstrate ValidMind's metric logging capabilities.\n", + "\n", + "- We'll employ a built-in classification dataset, process it through train-validation-test splits, and train an XGBoost classifier.\n", + "- The trained model and datasets are then initialized in ValidMind's framework, enabling us to track and monitor various performance metrics in the following sections." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the sample dataset from the library\n", + "\n", + "from validmind.datasets.classification import customer_churn\n", + "\n", + "print(\n", + " f\"Loaded demo dataset with: \\n\\n\\t• Target column: '{customer_churn.target_column}' \\n\\t• Class labels: {customer_churn.class_labels}\"\n", + ")\n", + "\n", + "raw_df = customer_churn.load_data()\n", + "raw_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df, validation_df, test_df = customer_churn.preprocess(raw_df)\n", + "\n", + "x_train = train_df.drop(customer_churn.target_column, axis=1)\n", + "y_train = train_df[customer_churn.target_column]\n", + "x_val = validation_df.drop(customer_churn.target_column, axis=1)\n", + "y_val = validation_df[customer_churn.target_column]\n", + "\n", + "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", + "model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "model.fit(\n", + " x_train,\n", + " y_train,\n", + " eval_set=[(x_val, y_val)],\n", + " verbose=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the datasets and model are prepared for validation, let's initialize the ValidMind `dataset` and `model`, specifying features and targets columns.\n", + "\n", + "- The property `input_id` allows users to uniquely identify each dataset and model.\n", + "- This allows for the creation of multiple versions of datasets and models, enabling us to compute metrics by specifying which versions we want to use as inputs." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=raw_df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=customer_churn.target_column,\n", + " class_labels=customer_churn.class_labels,\n", + ")\n", + "\n", + "vm_train_ds = vm.init_dataset(\n", + " dataset=train_df,\n", + " input_id=\"train_dataset\",\n", + " target_column=customer_churn.target_column,\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " dataset=test_df, input_id=\"test_dataset\", target_column=customer_churn.target_column\n", + ")\n", + "\n", + "vm_model = vm.init_model(\n", + " model,\n", + " input_id=\"model\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now use the `assign_predictions()` method from the Dataset object to link existing predictions to any model. \n", + "\n", + "If no prediction values are passed, the method will compute predictions automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds.assign_predictions(\n", + " model=vm_model,\n", + ")\n", + "\n", + "vm_test_ds.assign_predictions(\n", + " model=vm_model,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Log metrics\n", + "\n", + "Next, we'll use ValidMind to track the temporal evolution of key model performance metrics.\n", + "\n", + "We'll set appropriate thresholds for each metric, enable automated alerting when performance drifts beyond acceptable boundaries, and demonstrate how these thresholds can be customized based on business requirements and risk tolerance levels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = [metric for metric in list_metrics() if \"classification\" in metric]\n", + "\n", + "for metric_id in metrics:\n", + " describe_metric(metric_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run unit metrics\n", + "\n", + "Compute individual metrics using ValidMind's *unit metrics* — single-value metrics that can be computed on a dataset and model. Use the `run_metric()` function from the `validmind.unit_metrics` module to calculate these metrics.\n", + "\n", + "The `run_metric()` function has a signature similar to `run_test()` from the `validmind.tests` module, but is specifically designed for unit metrics and takes the following arguments:\n", + "\n", + "- **`metric_id`:** The unique identifier for the metric (for example, `validmind.unit_metrics.classification.ROC_AUC`)\n", + "- **`inputs`:** A dictionary containing the input dataset and model or their respective input IDs\n", + "- **`params`:** A dictionary containing keyword arguments for the unit metric (optional, accepts any `kwargs` from the underlying sklearn implementation)\n", + "\n", + "`run_metric()` returns and displays a result object similar to a regular ValidMind test, but only shows the unit metric value. While this result object has a `.log()` method for logging to the ValidMind Platform, in this use case we'll use unit metrics to compute performance metrics and then log them over time using the `log_metric()` function from the `validmind.api_client` module." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.ROC_AUC\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "auc = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Accuracy\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "accuracy = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Recall\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "recall = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.F1\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "f1 = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Precision\",\n", + " inputs={\n", + " \"model\": vm_model,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ")\n", + "precision = result.metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Log unit metrics over time\n", + "\n", + "Using the `log_metric()` function from the `validmind.api_client` module, let's log the unit metrics over time. 
This function takes the following arguments:\n", + "\n", + "- **`key`:** The name of the metric to log\n", + "- **`value`:** The value of the metric to log\n", + "- **`recorded_at`:** The timestamp of the metric to log — useful for logging historic predictions\n", + "- **`thresholds`:** A dictionary containing the thresholds for the metric to log\n", + "- **`params`:** A dictionary containing the keyword arguments for the unit metric (in this case, none are required, but we can pass any `kwargs` that the underlying sklearn implementation accepts)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "log_metric(\n", + " key=\"AUC Score\",\n", + " value=auc,\n", + " # If `recorded_at` is not included, the time at function run is logged\n", + " recorded_at=datetime(2024, 1, 1), \n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To visualize the logged metric, we'll use the **[Metrics Over Time block](https://docs.validmind.ai/guide/monitoring/work-with-metrics-over-time.html)** in the ValidMind Platform:\n", + "\n", + "- After adding this visualization block to your documentation or ongoing monitoring report (as shown in the image below), you'll be able to review your logged metrics plotted over time.\n", + "- In this example, since we've only logged a single data point, the visualization shows just one measurement.\n", + "- As you continue logging metrics, the graph will populate with more points, enabling you to track trends and patterns.\n", + "\n", + "![Metric Over Time block](../images/add_metric_over_time_block.png)\n", + "![AUC Score](../images/log_metric_auc_1.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Pass thresholds\n", + "\n", + "We can pass *thresholds* to the `log_metric()` function to enhance the metric over time: \n", + "\n", + "- This is useful for visualizing the metric over time and identifying potential issues. \n", + "- The metric visualization component provides a dynamic way to monitor and contextualize metric values through customizable thresholds. \n", + "- These thresholds appear as horizontal reference lines on the chart. \n", + "- The system always displays the most recent threshold configuration, meaning that if you update threshold values in your client application, the visualization will reflect these changes immediately. \n", + "\n", + "When a metric is logged without thresholds or with an empty threshold dictionary, the reference lines gracefully disappear from the chart, though the metric line itself remains visible. \n", + "\n", + "Thresholds are highly flexible in their implementation. You can define them with any meaningful key names (such as `low_risk`, `maximum`, `target`, or `acceptable_range`) in your metric data, and the visualization will adapt accordingly. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "log_metric(\n", + " key=\"AUC Score\",\n", + " value=auc,\n", + " recorded_at=datetime(2024, 1, 1),\n", + " thresholds={\n", + " \"min_auc\": 0.7,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![AUC Score](../images/log_metric_auc_2.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "log_metric(\n", + " key=\"AUC Score\",\n", + " value=auc,\n", + " recorded_at=datetime(2024, 1, 1),\n", + " thresholds={\n", + " \"high_risk\": 0.6,\n", + " \"medium_risk\": 0.7,\n", + " \"low_risk\": 0.8,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![AUC Score](../images/log_metric_auc_3.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Log multiple metrics with custom thresholds\n", + "\n", + "The following code snippet shows an example of how to set up and log multiple performance metrics with custom thresholds for each metric:\n", + "\n", + "- Using AUC, F1, Precision, Recall, and Accuracy scores as examples, it demonstrates how to define different risk levels (high, medium, low) appropriate for each metric's expected range.\n", + "- The code simulates 10 days of metric history by applying a gradual decay and random noise to help visualize how metrics might drift over time in a production environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_DAYS = 10\n", + "REFERENCE_DATE = datetime(2024, 1, 1) # Fixed date: January 1st, 2024\n", + "base_date = REFERENCE_DATE - timedelta(days=NUM_DAYS)\n", + "\n", + "# Initial values with their specific thresholds\n", + "performance_metrics = {\n", + " \"AUC Score\": {\n", + " \"value\": auc,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.7,\n", + " \"medium_risk\": 0.8,\n", + " \"low_risk\": 0.9,\n", + " }\n", + " },\n", + " \"F1 Score\": {\n", + " \"value\": f1,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.5,\n", + " \"medium_risk\": 0.6,\n", + " \"low_risk\": 0.7,\n", + " }\n", + " },\n", + " \"Precision Score\": {\n", + " \"value\": precision,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.6,\n", + " \"medium_risk\": 0.7,\n", + " \"low_risk\": 0.8,\n", + " }\n", + " },\n", + " \"Recall Score\": {\n", + " \"value\": recall,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.4,\n", + " \"medium_risk\": 0.5,\n", + " \"low_risk\": 0.6,\n", + " }\n", + " },\n", + " \"Accuracy Score\": {\n", + " \"value\": accuracy,\n", + " \"thresholds\": {\n", + " \"high_risk\": 0.75,\n", + " \"medium_risk\": 0.8,\n", + " \"low_risk\": 0.85,\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Trend parameters\n", + "trend_factor = 0.98 # Slight downward trend\n", + "noise_scale = 0.02 # Random fluctuation of ±2%\n", + "\n", + "for i in range(NUM_DAYS):\n", + " recorded_at = base_date + timedelta(days=i)\n", + " print(f\"\\nrecorded_at: {recorded_at}\")\n", + "\n", + " # Log each metric with trend and noise\n", + " for metric_name, metric_info in performance_metrics.items():\n", + " base_value = metric_info[\"value\"]\n", + " thresholds = metric_info[\"thresholds\"]\n", + " \n", + " # Apply trend and add random noise\n", + " trend = base_value * (trend_factor ** i)\n", + " noise = np.random.normal(0, noise_scale * base_value)\n", + " value = max(0, min(1, trend + noise)) # 
Ensure value stays between 0 and 1\n", + " \n", + " log_metric(\n", + " key=metric_name,\n", + " value=value,\n", + " recorded_at=recorded_at.isoformat(),\n", + " thresholds=thresholds\n", + " )\n", + " \n", + " print(f\"{metric_name:<15}: {value:.4f} (Thresholds: {thresholds})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![AUC Score](../images/log_metric_auc_4.png)\n", + "![Accuracy Score](../images/log_metric_accuracy.png)\n", + "![Precision Score](../images/log_metric_precision.png)\n", + "![Recall Score](../images/log_metric_recall.png)\n", + "![F1 Score](../images/log_metric_f1.png)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/site/notebooks/images/add_metric_over_time_block.png b/site/notebooks/images/add_metric_over_time_block.png new file mode 100644 index 0000000000..5ddaa84faa Binary files /dev/null and b/site/notebooks/images/add_metric_over_time_block.png differ diff --git a/site/notebooks/images/log_metric_accuracy.png b/site/notebooks/images/log_metric_accuracy.png new file mode 100644 index 0000000000..6d47a55c89 Binary files /dev/null and b/site/notebooks/images/log_metric_accuracy.png differ diff --git a/site/notebooks/images/log_metric_auc_1.png b/site/notebooks/images/log_metric_auc_1.png new file mode 100644 index 0000000000..767da49a51 Binary files /dev/null and b/site/notebooks/images/log_metric_auc_1.png differ diff --git a/site/notebooks/images/log_metric_auc_2.png b/site/notebooks/images/log_metric_auc_2.png new file mode 100644 index 0000000000..8b79b09451 Binary files /dev/null and b/site/notebooks/images/log_metric_auc_2.png differ diff --git a/site/notebooks/images/log_metric_auc_3.png b/site/notebooks/images/log_metric_auc_3.png new file mode 100644 index 0000000000..84fa26ffc7 Binary files /dev/null and b/site/notebooks/images/log_metric_auc_3.png differ diff --git a/site/notebooks/images/log_metric_auc_4.png b/site/notebooks/images/log_metric_auc_4.png new file mode 100644 index 0000000000..aa1fa53265 Binary files /dev/null and b/site/notebooks/images/log_metric_auc_4.png differ diff --git a/site/notebooks/images/log_metric_f1.png b/site/notebooks/images/log_metric_f1.png new file mode 100644 index 0000000000..9e08241395 Binary files /dev/null and b/site/notebooks/images/log_metric_f1.png differ diff --git a/site/notebooks/images/log_metric_precision.png b/site/notebooks/images/log_metric_precision.png new file mode 100644 index 0000000000..946484c03b Binary files /dev/null and b/site/notebooks/images/log_metric_precision.png differ diff --git a/site/notebooks/images/log_metric_recall.png b/site/notebooks/images/log_metric_recall.png new file mode 100644 index 0000000000..1cd24a95ea Binary files /dev/null and b/site/notebooks/images/log_metric_recall.png differ diff --git a/site/python-docs.zip b/site/python-docs.zip index 54900bc708..127abb3922 100644 Binary files a/site/python-docs.zip and b/site/python-docs.zip differ diff --git a/site/releases/2024/2024-dec-24/release-notes.qmd b/site/releases/2024/2024-dec-24/release-notes.qmd index 91a9d75059..2884f518e9 100644 --- a/site/releases/2024/2024-dec-24/release-notes.qmd 
+++ b/site/releases/2024/2024-dec-24/release-notes.qmd @@ -2,6 +2,17 @@ title: "December 24, 2024" aliases: - ../../2024-dec-24/release-notes.html +listing: + - id: deep-dive + type: grid + grid-columns: 1 + max-description-length: 250 + # image-height: 100% + contents: + - path: https://validmind.com/blog/tech-deep-dive-custom-reports-dashboards/ + title: "Tech Deep Dive: Custom Reports & Dashboards {{< fa chevron-right >}}" + description: "For an introduction to how this feature can benefit your organization, check out our companion blog post." + fields: [title, description] --- This release brings additional customization features to the {{< var validmind.platform >}}, improvements to our documentation site, a brand new Help Center, and more! @@ -89,6 +100,20 @@ Widget types include saved inventory and model findings views[^1], analytics rep ![Custom `High Risk` dashboard showing widgets for Tier 1 models and High Severity findings](custom-dashboard.png){width=80% fig-alt="A screenshot of a custom `High Risk` dashboard showing widgets for Tier 1 models and High Severity findings" .screenshot} +:::: {.flex .flex-wrap .justify-around} + +::: {.w-50-ns .pr4} +You can add as many dashboards as you need to suit different use cases, whether it's managing model documentation and testing as a developer, tracking findings in validation reports as a validator, or monitoring analytics as a model owner. +::: + +::: {.w-50-ns .tc} +:::{#deep-dive} +::: + +::: + +:::: + ### Documentation #### User guide updates diff --git a/site/releases/2025/2025-jan-31/create-new-organization.png b/site/releases/2025/2025-jan-31/create-new-organization.png new file mode 100644 index 0000000000..0274dd6fcf Binary files /dev/null and b/site/releases/2025/2025-jan-31/create-new-organization.png differ diff --git a/site/releases/2025/2025-jan-31/dashboard-edit-mode.png b/site/releases/2025/2025-jan-31/dashboard-edit-mode.png new file mode 100644 index 0000000000..8b486d3b29 Binary files /dev/null and b/site/releases/2025/2025-jan-31/dashboard-edit-mode.png differ diff --git a/site/releases/2025/2025-jan-31/dashboard-view-mode.png b/site/releases/2025/2025-jan-31/dashboard-view-mode.png new file mode 100644 index 0000000000..7ca793f0fb Binary files /dev/null and b/site/releases/2025/2025-jan-31/dashboard-view-mode.png differ diff --git a/site/releases/2025/2025-jan-31/math-editor.png b/site/releases/2025/2025-jan-31/math-editor.png new file mode 100644 index 0000000000..22bf2f97e3 Binary files /dev/null and b/site/releases/2025/2025-jan-31/math-editor.png differ diff --git a/site/releases/2025/2025-jan-31/metric-threshold-lines.png b/site/releases/2025/2025-jan-31/metric-threshold-lines.png new file mode 100644 index 0000000000..aa1fa53265 Binary files /dev/null and b/site/releases/2025/2025-jan-31/metric-threshold-lines.png differ diff --git a/site/releases/2025/2025-jan-31/release-notes.qmd b/site/releases/2025/2025-jan-31/release-notes.qmd new file mode 100644 index 0000000000..c73e95078e --- /dev/null +++ b/site/releases/2025/2025-jan-31/release-notes.qmd @@ -0,0 +1,589 @@ +--- +title: "January 31, 2025" +listing: + - id: test-desc + type: grid + grid-columns: 1 + max-description-length: 250 + # image-height: 100% + contents: + - path: /notebooks/how_to/add_context_to_llm_descriptions.ipynb + title: "Add context to LLM-generated test descriptions" + description: "Learn how to add custom context to LLM-generated test descriptions {{< fa chevron-right >}}" + - id: credit-risk + type: grid + grid-columns: 1 + 
max-description-length: 250 + # image-height: 100% + contents: + - path: https://jupyterhub.validmind.ai/hub/user-redirect/lab/tree/code_samples/credit_risk/application_scorecard_with_ml.ipynb + title: "Document an application scorecard model" + categories: ["Individual Tests"] + description: "Open notebook in JupyterHub {{< fa chevron-right >}}" + - path: https://jupyterhub.validmind.ai/hub/user-redirect/lab/tree/code_samples/credit_risk/application_scorecard_full_suite.ipynb + title: "Document an application scorecard model" + categories: ["Full Test Suite"] + description: "Open notebook in JupyterHub {{< fa chevron-right >}}" + - path: https://jupyterhub.validmind.ai/hub/user-redirect/lab/tree/code_samples/credit_risk/application_scorecard_executive.ipynb + title: "Document an application scorecard model" + categories: ["Single Function"] + description: "Open notebook in JupyterHub {{< fa chevron-right >}}" + - id: ongoing-monitoring + type: grid + grid-columns: 1 + max-description-length: 250 + # image-height: 100% + contents: + - path: https://jupyterhub.validmind.ai/hub/user-redirect/lab/tree/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb) + title: "Ongoing monitoring for application scorecard" + description: "Open notebook in JupyterHub {{< fa chevron-right >}}" + - id: e2e-template + type: grid + grid-columns: 1 + max-description-length: 250 + # image-height: 100% + contents: + - path: https://github.com/validmind/validmind-library/blob/main/notebooks/templates/e2e-notebook.ipynb + title: "End-to-end notebook template generation" + description: "Open notebook on GitHub {{< fa chevron-right >}}" + fields: [title, categories, description] +--- + +This release includes our new unified versioning scheme for our software, support for thresholds in unit metrics and custom context for test descriptions within the {{< var validmind.developer >}}, and many more enhancements. + +::: {.highlights} + +## Release highlights — `25.01` + +::: {.callout} +Our documentation now follows the new **unified versioning scheme** for our software, starting with this `25.01` release. Included in this release are: + +- **{{< var validmind.developer >}} — `v2.7.7`** +- **{{< var validmind.platform >}} — `v1.29.10`** + +#### Why a unified versioning scheme? + +We manage multiple repositories, each with its own version tags. The new versioning scheme replaces the {{< var validmind.developer >}} version in the documentation to clarify that each release includes code from multiple repositories rather than a single source. + +This change simplifies tracking changes for each {{< var vm.product >}} release and streamlines version management for you. Release frequency and the upgrade process remain unchanged. +::: + +### {{< var validmind.developer >}} (v2.7.7) + + + +#### Threshold lines in unit metric plots + +When logging metrics using `log_metric()`, you can now include a `thresholds` dictionary. For example, use `thresholds={"target": 0.8, "minimum": 0.6}` to define multiple reference levels. + +![Threshold lines in unit metric plots](metric-threshold-lines.png){width="949" fig-alt="A screenshot showing threshold lines in unit metric plots" .screenshot} + + +:::: {.flex .flex-wrap .justify-around} + +::: {.w-70-ns .pr4} +- These thresholds automatically appear as horizontal reference lines when you add a *Metric Over Time* block to the documentation. + +- The visualization uses a distinct color palette to differentiate between thresholds. 
It displays only the most recent threshold configuration and includes threshold information in both the chart legend and data table. +::: + +::: {.w-30-ns .tr} +[log_metric()](/validmind/validmind.html#log_metric){.button .button-green target="_blank"} + +[Add metrics over time](/guide/monitoring/work-with-metrics-over-time.qmd){.button .button-green} + +::: + +:::: + +::: {.column-margin} +**Usage example:** + +```python +log_metric( + key="AUC Score", + value=auc, + recorded_at=datetime(2024, 1, 1), + thresholds={ + "high_risk": 0.6, + "medium_risk": 0.7, + "low_risk": 0.8, + } +) +``` + +::: + +This enhancement provides immediate visual context for metric values. It helps track metric performance against multiple defined thresholds over time. + + + + + +#### Add context to enhance LLM-based text generation for model test results + +You can now include contextual information to enhance LLM-based generation of test results descriptions and interpretations. This enhancement improves test result descriptions by incorporating additional context that can be specified through environment variables. + +::: {.column-margin} +:::{#test-desc} +::: + +::: + +A new notebook demonstrates adding context to LLM-based descriptions with examples of: + +- Setting up the environment +- Initializing datasets and models +- Running tests with and without context + + + + + +#### Document credit risk scorecard models using XGBoost + +We've introduced enhancements to the {{< var validmind.developer >}} that focus on documenting credit risk scorecard models: + +- **New notebooks**: Learn how to document application scorecard models using the {{< var vm.developer >}}. These notebooks provide a step-by-step guide for loading a demo dataset, preprocessing data, training models, and documenting the model. + + You can choose from three different approaches: **running individual tests**, **running a full test suite**, or **using a single function** to document a model. + +::: {.column-margin} +:::{#credit-risk} +::: + +::: + +- **New tests**: + + - [`MutualInformation`](/tests/data_validation/MutualInformation.md): Evaluates feature relevance by calculating mutual information scores between features and the target variable. + - [`ScoreBandDefaultRates`](/tests/data_validation/ScoreBandDefaultRates.md): Analyzes default rates and population distribution across credit score bands. + - [`CalibrationCurve`](/tests/model_validation/sklearn/CalibrationCurve.md): Assesses calibration by comparing predicted probabilities against observed frequencies. + - [`ClassifierThresholdOptimization`](/tests/model_validation/sklearn/ClassifierThresholdOptimization.md): Visualizes threshold optimization methods for binary classification models. + - [`ModelParameters`](/tests/model_validation/sklearn/ModelParameters.md): Extracts and displays model parameters for transparency and reproducibility. + - [`ScoreProbabilityAlignment`](/tests/model_validation/sklearn/ScoreProbabilityAlignment.md): Evaluates alignment between credit scores and predicted probabilities. + +Modifications have also been made to existing tests to improve functionality and accuracy. The [`TooManyZeroValues`](/tests/data_validation/TooManyZeroValues.md) test now includes a row count and applies a percentage threshold for zero values. + +The [`split`](/validmind/validmind/datasets/regression/lending_club.html#preprocess){target="_blank"} function in `lending_club.py` has been enhanced to support an optional validation set, allowing for more flexible dataset splitting. 
+ +A new utility function, [`get_demo_test_config`](/validmind/validmind/datasets/credit_risk/lending_club.html#get_demo_test_config){target="_blank"}, has been added to generate a default test configuration for demo purposes. + + +#### Ongoing monitoring notebook for application scorecard model + +Several enhancements to the {{< var validmind.developer >}} focus on ongoing monitoring capabilities: + +- **New notebook**: Learn how to use ongoing monitoring with credit risk datasets in this step-by-step guide for the {{< var validmind.developer >}}. + + - Use our new metrics for data and model drift, and populate the ongoing monitoring documentation for a scorecard model.[^1] + +::: {.column-margin} +:::{#ongoing-monitoring} +::: + +::: + +- **Custom tests**: Define and run your own tests using the {{< var vm.developer >}}: + + - [`ScoreBandDiscriminationMetrics.py`](https://github.com/validmind/validmind-library/blob/main/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py): Evaluates discrimination metrics across different score bands. + +- **New tests**: + + - [`CalibrationCurveDrift`](/tests/ongoing_monitoring/CalibrationCurveDrift.md): Evaluates changes in probability calibration. + - [`ClassDiscriminationDrift`](/tests/ongoing_monitoring/ClassDiscriminationDrift.md): Compares classification discrimination metrics. + - [`ClassImbalanceDrift`](/tests/ongoing_monitoring/ClassImbalanceDrift.md): Evaluates drift in class distribution. + - [`ClassificationAccuracyDrift`](/tests/ongoing_monitoring/ClassificationAccuracyDrift.md): Compares classification accuracy metrics. + - [`ConfusionMatrixDrift`](/tests/ongoing_monitoring/ConfusionMatrixDrift.md): Compares confusion matrix metrics. + - [`CumulativePredictionProbabilitiesDrift`](/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.md): Compares cumulative prediction probability distributions. + - [`FeatureDrift`](/tests/ongoing_monitoring/FeatureDrift.md): Evaluates changes in feature distribution. + - [`PredictionAcrossEachFeature`](/tests/ongoing_monitoring/PredictionAcrossEachFeature.md): Assesses prediction distributions across features. + - [`PredictionCorrelation`](/tests/ongoing_monitoring/PredictionCorrelation.md): Assesses correlation changes between predictions and features. + - [`PredictionProbabilitiesHistogramDrift`](/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.md): Compares prediction probability distributions. + - [`PredictionQuantilesAcrossFeatures`](/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.md): Assesses prediction distributions across features using quantiles. + - [`ROCCurveDrift`](/tests/ongoing_monitoring/ROCCurveDrift.md): Compares ROC curves. + - [`ScoreBandsDrift`](/tests/ongoing_monitoring/ScoreBandsDrift.md): Analyzes drift in score bands. + - [`ScorecardHistogramDrift`](/tests/ongoing_monitoring/ScorecardHistogramDrift.md): Compares score distributions. + - [`TargetPredictionDistributionPlot`](/tests/ongoing_monitoring/TargetPredictionDistributionPlot.md): Assesses differences in prediction distributions. + +We also improved dataset loading, preprocessing, and feature engineering functions with verbosity control for cleaner output. + + +#### Jupyter Notebook templates + +Want to create your own code samples in the style of ValidMind's own notebooks? We've now made it easier for contributors to submit custom code samples. 
+ +:::: {.flex .flex-wrap .justify-around} + +::: {.w-50-ns .pr4} +Our **end-to-end notebook template generation notebook** will generate a new file with all the bits and pieces of a standard ValidMind notebook to get you started. + +The same functionality is also accessible from our [Makefile](https://github.com/validmind/validmind-library/blob/main/Makefile#L83C1-L85C44): + +```bash +make notebook +``` + +:::{#e2e-template} +::: + + +::: + +::: {.w-50-ns .pl4 .nt4} +##### Mini-templates + +The template generation notebook draws from a number of mini-templates, should you need to revise them or grab the information from them manually: + +- `about-validmind.ipynb`: Conceptual overview of ValidMind & prerequisites. +- `install-initialize-validmind.ipynb`: ValidMind Library installation & initialization instructions. +- `next-steps.ipynb`: Directions to review the generated documentation within the ValidMind Platform & additional learning resources. +- `upgrade-validmind.ipynb`: Instructions for comparing & upgrading versions of the ValidMind Library. +::: + +:::: + +### {{< var validmind.platform >}} (v1.29.10) + + +#### Edit your dashboards + +We’ve streamlined dashboard configuration with dedicated view and edit modes. Click **Edit Mode** to make changes, then click **Done Editing** to save and return to view mode: + +::: {.column-margin} +[Customize your dashboard](/guide/configuration/customize-your-dashboard.qmd){.button .button-green} + +::: + +![Edit mode for your dashboard](dashboard-edit-mode.png){width="809" fig-alt="A screenshot showing edit mode" .screenshot} + + + +To prevent any confusion when multiple people are working on the same dashboard, we've added some helpful safeguards: + +- If someone else makes changes while you're editing, you'll get a friendly notification to reload the page +- The system automatically detects if you're looking at an older version of the dashboard and prompts you to get the latest updates + + + + + + +#### Optional prompt for risk assessments + +Risk assessment generation has been enhanced to allow you to provide an optional prompt before starting text generation. This feature lets you guide the output, ensuring that the generated text aligns more closely with your specific requirements. + +::: {.column-margin} +[Assess compliance](/guide/model-validation/assess-compliance.qmd){.button .button-green} + +::: + +![Optional prompt for risk assessments](risk-assessment-prompt.gif){.screenshot} + +::: + +## Enhancements + +### {{< var validmind.developer >}} (v2.7.7) + + +#### Static descriptions in test results + +The `TestResult` class now exposes pre-populated test descriptions through the `doc` property, separating them from dynamically generated GenAI descriptions: + +:::: {.flex .flex-wrap .justify-around} + +::: {.w-80-ns} +- `result.doc` — contains the original docstring of the test. +- `result.description` — contains the dynamically generated description. +::: + +::: {.w-20-ns .tr} +[`TestResult`](/validmind/validmind/vm_models.html#TestResult){.button target="_blank"} +::: + +:::: + +This enhancement makes it easier to distinguish between ValidMind's standard test documentation and the dynamic, context-aware descriptions generated for your specific test results. 
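For example, a minimal sketch (the test ID and inputs below are placeholders for whatever test you have just run):

```python
import validmind as vm

# Run any ValidMind test as usual (illustrative test ID and inputs)
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.ROCCurve",
    inputs={"model": vm_model, "dataset": vm_test_ds},
)

print(result.doc)          # the original docstring of the test
print(result.description)  # the dynamically generated description
```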
+ +:::: {.flex .flex-wrap .justify-around} + +::: {.w-70-ns} + You can browse the full catalog of official test descriptions in our test documentation: +::: + +::: {.w-30-ns .tr} +[Test descriptions](/developer/model-testing/test-descriptions.qmd){.button target="_blank"} +::: + +:::: + + +#### Raw data storage for tests + +:::: {.flex .flex-wrap .justify-around} + +::: {.w-80-ns} +We added raw data storage across all {{< var validmind.developer >}} tests. Every test now returns a `RawData` object, allowing post-processing functions to recreate any test output. This feature enhances flexibility and customizability. +::: + +::: {.w-20-ns .tr} +[`RawData`](/validmind/validmind.html#RawData){.button target="_blank"} +::: + +:::: + + +#### New `print_env` function + +:::: {.flex .flex-wrap .justify-around} + +::: {.w-80-ns} +We've added a new diagnostic `print_env()` utility function that displays comprehensive information about your running environment. This function is particularly useful when: +::: + +::: {.w-20-ns .tr} +[print_env()](/validmind/validmind.html#print_env){.button target="_blank"} +::: + +:::: +- Troubleshooting issues in your code +- Seeking support from the ValidMind team +- Verifying your environment configuration + +::: {.column-margin} + +**Usage example:** + +```python +import validmind + +validmind.print_env() +``` +::: + +This function outputs key details, such as Python version, installed package versions, and relevant environment variables, making it easier to diagnose issues and share your setup with others. + + + +### {{< var validmind.platform >}} (v1.29.10) + + +#### Simplified workflow nodes + +Workflows are now easier to read when zoomed out, helped by a larger modal window and simplified nodes: + +::: {.column-margin} +[Working with model workflows](/guide/model-workflows/working-with-model-workflows.qmd){.button} + +::: + +![Workflow visualization showing simplified nodes](workflow-simplified-nodes.png){width=90% fig-alt="A screenshot showing the simplified workflow visualization with nodes" .screenshot} + +Zooming in reveals more details: + +![Workflow visualization in zoomed-out view](workflow-zoomed-view.png){width=90% fig-alt="A screenshot showing the simplified workflow visualization" .screenshot} + +Hovering over a node highlights all `in` and `out` connections, making relationships clearer: + +![Workflow connection highlighting on hover](workflow-connection-hover.png){width=90% fig-alt="A screenshot showing the workflow connection highlighting" .screenshot} + + + + + + + + + + + + + + + + + + + + + + + + + + + + +#### New editor for mathematical formulas + +:::: {.flex .flex-wrap .justify-around} + +::: {.w-70-ns .pr4} +We replaced the plugin for the editor of mathematical equations and formulas. The new plugin provides an improved interface for adding and editing LaTeX expressions in your documentation. + +The new editor also includes a real-time preview and common mathematical symbols for easier equation creation. 
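For instance, an expression like the following (purely illustrative) can be typed or pasted into the editor and previewed in real time:

```latex
PD = \frac{1}{1 + e^{-\left(\beta_0 + \beta_1 x_1 + \dots + \beta_n x_n\right)}}
```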
+ +::: {.tc} +[Add mathematical formulas](/guide/model-documentation/work-with-content-blocks.qmd#add-mathematical-formulas){.button} +::: + +::: + +::: {.w-30-ns} +![New editor for mathematical equations and formulas](math-editor.png){fig-alt="A screenshot showing the new editor for mathematical equations and formulas" .screenshot} + +::: + +:::: + + + + + + + + + + + + + + +{{< include /releases/_how-to-upgrade.qmd >}} + + + + +[^1]: [Document credit risk scorecard models using XGBoost](#credit-risk) diff --git a/site/releases/2025/2025-jan-31/risk-assessment-prompt.gif b/site/releases/2025/2025-jan-31/risk-assessment-prompt.gif new file mode 100644 index 0000000000..f00ab448ff Binary files /dev/null and b/site/releases/2025/2025-jan-31/risk-assessment-prompt.gif differ diff --git a/site/releases/2025/2025-jan-31/workflow-connection-hover.png b/site/releases/2025/2025-jan-31/workflow-connection-hover.png new file mode 100644 index 0000000000..b697fe3b25 Binary files /dev/null and b/site/releases/2025/2025-jan-31/workflow-connection-hover.png differ diff --git a/site/releases/2025/2025-jan-31/workflow-simplified-nodes.png b/site/releases/2025/2025-jan-31/workflow-simplified-nodes.png new file mode 100644 index 0000000000..a1b911157f Binary files /dev/null and b/site/releases/2025/2025-jan-31/workflow-simplified-nodes.png differ diff --git a/site/releases/2025/2025-jan-31/workflow-zoomed-view.png b/site/releases/2025/2025-jan-31/workflow-zoomed-view.png new file mode 100644 index 0000000000..27dcae58a3 Binary files /dev/null and b/site/releases/2025/2025-jan-31/workflow-zoomed-view.png differ diff --git a/site/tests/ongoing_monitoring/CalibrationCurveDrift.md b/site/tests/ongoing_monitoring/CalibrationCurveDrift.md new file mode 100644 index 0000000000..3fa615e8df --- /dev/null +++ b/site/tests/ongoing_monitoring/CalibrationCurveDrift.md @@ -0,0 +1,46 @@ +# CalibrationCurveDrift + +Evaluates changes in probability calibration between reference and monitoring datasets. + +### Purpose + +The Calibration Curve Drift test is designed to assess changes in the model's probability calibration +over time. By comparing calibration curves between reference and monitoring datasets, this test helps +identify whether the model's probability estimates remain reliable in production. This is crucial for +understanding if the model's risk predictions maintain their intended interpretation and whether +recalibration might be necessary. + +### Test Mechanism + +This test proceeds by generating calibration curves for both reference and monitoring datasets. For each +dataset, it bins the predicted probabilities and calculates the actual fraction of positives within each +bin. It then compares these values between datasets to identify significant shifts in calibration. +The test quantifies drift as percentage changes in both mean predicted probabilities and actual fractions +of positives per bin, providing both visual and numerical assessments of calibration stability. 
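As a rough sketch of this mechanism (using scikit-learn rather than the ValidMind implementation; `y_ref`/`p_ref` and `y_mon`/`p_mon` are placeholder arrays of labels and predicted probabilities for the reference and monitoring datasets):

```python
import numpy as np
from sklearn.calibration import calibration_curve

def calibration_drift(y_ref, p_ref, y_mon, p_mon, n_bins=10):
    """Percentage change in the observed fraction of positives per probability bin."""
    ref_true, ref_pred = calibration_curve(y_ref, p_ref, n_bins=n_bins)
    mon_true, mon_pred = calibration_curve(y_mon, p_mon, n_bins=n_bins)
    # Assumes both datasets populate the same bins; empty bins (which
    # calibration_curve drops) need special handling in practice
    return 100 * (mon_true - ref_true) / np.abs(ref_true)
```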
+ +### Signs of High Risk + +- Large differences between reference and monitoring calibration curves +- Systematic over-estimation or under-estimation in monitoring dataset +- Significant drift percentages exceeding the threshold in multiple bins +- Changes in calibration concentrated in specific probability ranges +- Inconsistent drift patterns across the probability spectrum +- Empty or sparse bins indicating insufficient data for reliable comparison + +### Strengths + +- Provides visual and quantitative assessment of calibration changes +- Identifies specific probability ranges where calibration has shifted +- Enables early detection of systematic prediction biases +- Includes detailed bin-by-bin comparison of calibration metrics +- Handles edge cases with insufficient data in certain bins +- Supports both binary and probabilistic interpretation of results + +### Limitations + +- Requires sufficient data in each probability bin for reliable comparison +- Sensitive to choice of number of bins and binning strategy +- May not capture complex changes in probability distributions +- Cannot directly suggest recalibration parameters +- Limited to assessing probability calibration aspects +- Results may be affected by class imbalance changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ClassDiscriminationDrift.md b/site/tests/ongoing_monitoring/ClassDiscriminationDrift.md new file mode 100644 index 0000000000..6a81a93961 --- /dev/null +++ b/site/tests/ongoing_monitoring/ClassDiscriminationDrift.md @@ -0,0 +1,46 @@ +# ClassDiscriminationDrift + +Compares classification discrimination metrics between reference and monitoring datasets. + +### Purpose + +The Class Discrimination Drift test is designed to evaluate changes in the model's discriminative power +over time. By comparing key discrimination metrics between reference and monitoring datasets, this test +helps identify whether the model maintains its ability to separate classes in production. This is crucial +for understanding if the model's predictive power remains stable and whether its decision boundaries +continue to effectively distinguish between different classes. + +### Test Mechanism + +This test proceeds by calculating three key discrimination metrics for both reference and monitoring +datasets: ROC AUC (Area Under the Curve), GINI coefficient, and KS (Kolmogorov-Smirnov) statistic. +For binary classification, it computes all three metrics. For multiclass problems, it focuses on +macro-averaged ROC AUC. The test quantifies drift as percentage changes in these metrics between +datasets, providing a comprehensive assessment of discrimination stability. 
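A simplified sketch of these three metrics for a single dataset (scikit-learn based, not the ValidMind implementation; `y_ref`/`p_ref` and `y_mon`/`p_mon` are placeholder arrays of labels and predicted probabilities):

```python
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

def discrimination_metrics(y_true, y_prob):
    """ROC AUC, GINI and KS for a binary classifier."""
    auc = roc_auc_score(y_true, y_prob)
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return {"AUC": auc, "GINI": 2 * auc - 1, "KS": float(np.max(tpr - fpr))}

# Drift is then the percentage change between the two datasets
ref = discrimination_metrics(y_ref, p_ref)
mon = discrimination_metrics(y_mon, p_mon)
drift_pct = {k: 100 * (mon[k] - ref[k]) / abs(ref[k]) for k in ref}
```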
+ +### Signs of High Risk + +- Large drifts in discrimination metrics exceeding the threshold +- Significant drops in ROC AUC indicating reduced ranking ability +- Decreased GINI coefficients showing diminished separation power +- Reduced KS statistics suggesting weaker class distinction +- Inconsistent changes across different metrics +- Systematic degradation in discriminative performance + +### Strengths + +- Combines multiple complementary discrimination metrics +- Handles both binary and multiclass classification +- Provides clear quantitative drift assessment +- Enables early detection of model degradation +- Includes standardized drift threshold evaluation +- Supports comprehensive performance monitoring + +### Limitations + +- Does not identify root causes of discrimination drift +- May be sensitive to changes in class distribution +- Cannot suggest optimal decision threshold adjustments +- Limited to discrimination aspects of performance +- Requires sufficient data for reliable metric calculation +- May not capture subtle changes in decision boundaries \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ClassImbalanceDrift.md b/site/tests/ongoing_monitoring/ClassImbalanceDrift.md new file mode 100644 index 0000000000..9523239d03 --- /dev/null +++ b/site/tests/ongoing_monitoring/ClassImbalanceDrift.md @@ -0,0 +1,46 @@ +# ClassImbalanceDrift + +Evaluates drift in class distribution between reference and monitoring datasets. + +### Purpose + +The Class Imbalance Drift test is designed to detect changes in the distribution of target classes +over time. By comparing class proportions between reference and monitoring datasets, this test helps +identify whether the population structure remains stable in production. This is crucial for +understanding if the model continues to operate under similar class distribution assumptions and +whether retraining might be necessary due to significant shifts in class balance. + +### Test Mechanism + +This test proceeds by calculating class percentages for both reference and monitoring datasets. +It computes the proportion of each class and quantifies drift as the percentage difference in these +proportions between datasets. The test provides both visual and numerical comparisons of class +distributions, with special attention to changes that exceed the specified drift threshold. +Population stability is assessed on a class-by-class basis. 
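The core comparison can be sketched with pandas (an illustration only, not the ValidMind implementation; `y_ref` and `y_mon` are placeholder label series):

```python
import pandas as pd

def class_proportion_drift(y_ref, y_mon, threshold_pct=5.0):
    """Percentage-point change in class proportions between reference and monitoring labels."""
    ref_pct = pd.Series(y_ref).value_counts(normalize=True) * 100
    mon_pct = pd.Series(y_mon).value_counts(normalize=True) * 100
    out = pd.DataFrame({"reference_%": ref_pct, "monitoring_%": mon_pct}).fillna(0.0)
    out["drift"] = out["monitoring_%"] - out["reference_%"]
    out["exceeds_threshold"] = out["drift"].abs() > threshold_pct
    return out
```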
+ +### Signs of High Risk + +- Large shifts in class proportions exceeding the threshold +- Systematic changes affecting multiple classes +- Appearance of new classes or disappearance of existing ones +- Significant changes in minority class representation +- Reversal of majority-minority class relationships +- Unexpected changes in class ratios + +### Strengths + +- Provides clear visualization of distribution changes +- Identifies specific classes experiencing drift +- Enables early detection of population shifts +- Includes standardized drift threshold evaluation +- Supports both binary and multiclass problems +- Maintains interpretable percentage-based metrics + +### Limitations + +- Does not account for feature distribution changes +- Cannot identify root causes of class drift +- May be sensitive to small sample sizes +- Limited to target variable distribution only +- Requires sufficient samples per class +- May not capture subtle distribution changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ClassificationAccuracyDrift.md b/site/tests/ongoing_monitoring/ClassificationAccuracyDrift.md new file mode 100644 index 0000000000..a416c5d178 --- /dev/null +++ b/site/tests/ongoing_monitoring/ClassificationAccuracyDrift.md @@ -0,0 +1,46 @@ +# ClassificationAccuracyDrift + +Compares classification accuracy metrics between reference and monitoring datasets. + +### Purpose + +The Classification Accuracy Drift test is designed to evaluate changes in the model's predictive accuracy +over time. By comparing key accuracy metrics between reference and monitoring datasets, this test helps +identify whether the model maintains its performance levels in production. This is crucial for +understanding if the model's predictions remain reliable and whether its overall effectiveness has +degraded significantly. + +### Test Mechanism + +This test proceeds by calculating comprehensive accuracy metrics for both reference and monitoring +datasets. It computes overall accuracy, per-label precision, recall, and F1 scores, as well as +macro-averaged metrics. The test quantifies drift as percentage changes in these metrics between +datasets, providing both granular and aggregate views of accuracy changes. Special attention is paid +to per-label performance to identify class-specific degradation. 
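A minimal sketch of the per-label comparison (scikit-learn based, not the ValidMind implementation; `y_ref`/`pred_ref` and `y_mon`/`pred_mon` are placeholder arrays of true labels and hard predictions):

```python
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def accuracy_metrics(y_true, y_pred):
    """Overall accuracy plus per-label precision, recall and F1."""
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, zero_division=0)
    return accuracy_score(y_true, y_pred), pd.DataFrame(
        {"precision": precision, "recall": recall, "f1": f1}
    )

acc_ref, ref_metrics = accuracy_metrics(y_ref, pred_ref)
acc_mon, mon_metrics = accuracy_metrics(y_mon, pred_mon)
# Assumes both datasets contain the same set of labels
drift_pct = 100 * (mon_metrics - ref_metrics) / ref_metrics.abs()
```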
+ +### Signs of High Risk + +- Large drifts in accuracy metrics exceeding the threshold +- Inconsistent changes across different labels +- Significant drops in macro-averaged metrics +- Systematic degradation in specific class performance +- Unexpected improvements suggesting data quality issues +- Divergent trends between precision and recall + +### Strengths + +- Provides comprehensive accuracy assessment +- Identifies class-specific performance changes +- Enables early detection of model degradation +- Includes both micro and macro perspectives +- Supports multi-class classification evaluation +- Maintains interpretable drift thresholds + +### Limitations + +- May be sensitive to class distribution changes +- Does not account for prediction confidence +- Cannot identify root causes of accuracy drift +- Limited to accuracy-based metrics only +- Requires sufficient samples per class +- May not capture subtle performance changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ConfusionMatrixDrift.md b/site/tests/ongoing_monitoring/ConfusionMatrixDrift.md new file mode 100644 index 0000000000..cc38f3e5d1 --- /dev/null +++ b/site/tests/ongoing_monitoring/ConfusionMatrixDrift.md @@ -0,0 +1,46 @@ +# ConfusionMatrixDrift + +Compares confusion matrix metrics between reference and monitoring datasets. + +### Purpose + +The Confusion Matrix Drift test is designed to evaluate changes in the model's error patterns +over time. By comparing confusion matrix elements between reference and monitoring datasets, this +test helps identify whether the model maintains consistent prediction behavior in production. This +is crucial for understanding if the model's error patterns have shifted and whether specific types +of misclassifications have become more prevalent. + +### Test Mechanism + +This test proceeds by generating confusion matrices for both reference and monitoring datasets. +For binary classification, it tracks True Positives, True Negatives, False Positives, and False +Negatives as percentages of total predictions. For multiclass problems, it analyzes per-class +metrics including true positives and error rates. The test quantifies drift as percentage changes +in these metrics between datasets, providing detailed insight into shifting prediction patterns. 
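For the binary case, the percentage-based comparison can be sketched as follows (placeholder arrays `y_ref`/`pred_ref` and `y_mon`/`pred_mon`; not the ValidMind implementation):

```python
from sklearn.metrics import confusion_matrix

def confusion_percentages(y_true, y_pred):
    """TN, FP, FN and TP as percentages of all predictions."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total = tn + fp + fn + tp
    return {"TN": 100 * tn / total, "FP": 100 * fp / total,
            "FN": 100 * fn / total, "TP": 100 * tp / total}

ref = confusion_percentages(y_ref, pred_ref)
mon = confusion_percentages(y_mon, pred_mon)
drift = {cell: mon[cell] - ref[cell] for cell in ref}  # percentage-point change per cell
```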
+ +### Signs of High Risk + +- Large drifts in confusion matrix elements exceeding threshold +- Systematic changes in false positive or false negative rates +- Inconsistent changes across different classes +- Significant shifts in error patterns for specific classes +- Unexpected improvements in certain metrics +- Divergent trends between different types of errors + +### Strengths + +- Provides detailed analysis of prediction behavior +- Identifies specific types of prediction changes +- Enables early detection of systematic errors +- Includes comprehensive error pattern analysis +- Supports both binary and multiclass problems +- Maintains interpretable percentage-based metrics + +### Limitations + +- May be sensitive to class distribution changes +- Cannot identify root causes of prediction drift +- Requires sufficient samples for reliable comparison +- Limited to hard predictions (not probabilities) +- May not capture subtle changes in decision boundaries +- Complex interpretation for multiclass problems \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.md b/site/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.md new file mode 100644 index 0000000000..415bb204cf --- /dev/null +++ b/site/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.md @@ -0,0 +1,47 @@ +# CumulativePredictionProbabilitiesDrift + +Compares cumulative prediction probability distributions between reference and monitoring datasets. + +### Purpose + +The Cumulative Prediction Probabilities Drift test is designed to evaluate changes in the model's +probability predictions over time. By comparing cumulative distribution functions of predicted +probabilities between reference and monitoring datasets, this test helps identify whether the +model's probability assignments remain stable in production. This is crucial for understanding if +the model's risk assessment behavior has shifted and whether its probability calibration remains +consistent. + +### Test Mechanism + +This test proceeds by generating cumulative distribution functions (CDFs) of predicted probabilities +for both reference and monitoring datasets. For each class, it plots the cumulative proportion of +predictions against probability values, enabling direct comparison of probability distributions. +The test visualizes both the CDFs and their differences, providing insight into how probability +assignments have shifted across the entire probability range. 
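The cumulative comparison can be sketched for one class as follows (a numpy/matplotlib illustration, not the ValidMind implementation; `p_ref` and `p_mon` are placeholder arrays of predicted probabilities):

```python
import numpy as np
import matplotlib.pyplot as plt

def empirical_cdf(probs):
    """Sorted probabilities and their cumulative proportions."""
    x = np.sort(np.asarray(probs))
    return x, np.arange(1, len(x) + 1) / len(x)

x_ref, cdf_ref = empirical_cdf(p_ref)
x_mon, cdf_mon = empirical_cdf(p_mon)

plt.plot(x_ref, cdf_ref, label="reference")
plt.plot(x_mon, cdf_mon, label="monitoring")
plt.xlabel("Predicted probability")
plt.ylabel("Cumulative proportion")
plt.legend()
plt.show()
```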
+ +### Signs of High Risk + +- Large gaps between reference and monitoring CDFs +- Systematic shifts in probability assignments +- Concentration of differences in specific probability ranges +- Changes in the shape of probability distributions +- Unexpected patterns in cumulative differences +- Significant shifts in probability thresholds + +### Strengths + +- Provides comprehensive view of probability changes +- Identifies specific probability ranges with drift +- Enables visualization of distribution differences +- Supports analysis across multiple classes +- Maintains interpretable probability scale +- Captures subtle changes in probability assignments + +### Limitations + +- Does not provide single drift metric +- May be complex to interpret for multiple classes +- Cannot suggest probability recalibration +- Requires visual inspection for assessment +- Sensitive to sample size differences +- May not capture class-specific calibration issues \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.md b/site/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.md new file mode 100644 index 0000000000..21c585141b --- /dev/null +++ b/site/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.md @@ -0,0 +1,47 @@ +# PredictionProbabilitiesHistogramDrift + +Compares prediction probability distributions between reference and monitoring datasets. + +### Purpose + +The Prediction Probabilities Histogram Drift test is designed to evaluate changes in the model's +probability predictions over time. By comparing probability distributions between reference and +monitoring datasets using histograms, this test helps identify whether the model's probability +assignments have shifted in production. This is crucial for understanding if the model's risk +assessment behavior remains consistent and whether its probability estimates maintain their +original distribution patterns. + +### Test Mechanism + +This test proceeds by generating histograms of prediction probabilities for both reference and +monitoring datasets. For each class, it analyzes the distribution shape, central tendency, and +spread of probabilities. The test computes distribution moments (mean, variance, skewness, +kurtosis) and quantifies their drift between datasets. Visual comparison of overlaid histograms +provides immediate insight into distribution changes. 
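The moment comparison can be sketched as follows (a numpy/scipy illustration, not the ValidMind implementation; `p_ref` and `p_mon` are placeholder probability arrays):

```python
import numpy as np
from scipy.stats import kurtosis, skew

def distribution_moments(probs):
    """Mean, variance, skewness and kurtosis of a probability array."""
    p = np.asarray(probs)
    return {"mean": p.mean(), "variance": p.var(),
            "skewness": skew(p), "kurtosis": kurtosis(p)}

ref = distribution_moments(p_ref)
mon = distribution_moments(p_mon)
drift_pct = {k: 100 * (mon[k] - ref[k]) / abs(ref[k]) for k in ref}
```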
+ +### Signs of High Risk + +- Significant shifts in probability distribution shapes +- Large drifts in distribution moments exceeding threshold +- Appearance of new modes or peaks in monitoring data +- Changes in the spread or concentration of probabilities +- Systematic shifts in probability assignments +- Unexpected changes in distribution characteristics + +### Strengths + +- Provides intuitive visualization of probability changes +- Identifies specific changes in distribution shape +- Enables quantitative assessment of distribution drift +- Supports analysis across multiple classes +- Includes comprehensive moment analysis +- Maintains interpretable probability scale + +### Limitations + +- May be sensitive to binning choices +- Requires sufficient samples for reliable histograms +- Cannot suggest probability recalibration +- Complex interpretation for multiple classes +- May not capture subtle distribution changes +- Limited to univariate probability analysis \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.md b/site/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.md new file mode 100644 index 0000000000..36bd5ff060 --- /dev/null +++ b/site/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.md @@ -0,0 +1,34 @@ +# PredictionQuantilesAcrossFeatures + +Assesses differences in model prediction distributions across individual features between reference +and monitoring datasets through quantile analysis. + +### Purpose + +This test aims to visualize how prediction distributions vary across feature values by showing +quantile information between reference and monitoring datasets. It helps identify significant +shifts in prediction patterns and potential areas of model instability. + +### Test Mechanism + +The test generates box plots for each feature, comparing prediction probability distributions +between the reference and monitoring datasets. Each plot consists of two subplots showing the +quantile distribution of predictions: one for reference data and one for monitoring data. + +### Signs of High Risk + +- Significant differences in prediction distributions between reference and monitoring data +- Unexpected shifts in prediction quantiles across feature values +- Large changes in prediction variability between datasets + +### Strengths + +- Provides clear visualization of prediction distribution changes +- Shows outliers and variability in predictions across features +- Enables quick identification of problematic feature ranges + +### Limitations + +- May not capture complex relationships between features and predictions +- Quantile analysis may smooth over important individual predictions +- Requires careful interpretation of distribution changes \ No newline at end of file diff --git a/site/tests/ongoing_monitoring/ROCCurveDrift.md b/site/tests/ongoing_monitoring/ROCCurveDrift.md new file mode 100644 index 0000000000..8c556f72b6 --- /dev/null +++ b/site/tests/ongoing_monitoring/ROCCurveDrift.md @@ -0,0 +1,47 @@ +# ROCCurveDrift + +Compares ROC curves between reference and monitoring datasets. + +### Purpose + +The ROC Curve Drift test is designed to evaluate changes in the model's discriminative ability +over time. By comparing Receiver Operating Characteristic (ROC) curves between reference and +monitoring datasets, this test helps identify whether the model maintains its ability to +distinguish between classes across different decision thresholds. 
### Signs of High Risk

- Large differences between reference and monitoring ROC curves
- Significant drop in AUC score for monitoring dataset
- Systematic differences in specific FPR regions
- Changes in optimal operating points
- Inconsistent performance across different thresholds
- Unexpected crossovers between curves

### Strengths

- Provides comprehensive view of discriminative ability
- Identifies specific threshold ranges with drift
- Enables visualization of performance differences
- Includes AUC comparison for overall assessment
- Supports threshold-independent evaluation
- Maintains interpretable performance metrics

### Limitations

- Limited to binary classification problems
- May be sensitive to class distribution changes
- Cannot suggest optimal threshold adjustments
- Requires visual inspection for detailed analysis
- Complex interpretation of curve differences
- May not capture subtle performance changes
\ No newline at end of file
diff --git a/site/tests/ongoing_monitoring/ScoreBandsDrift.md b/site/tests/ongoing_monitoring/ScoreBandsDrift.md
new file mode 100644
index 0000000000..3f25439f8d
--- /dev/null
+++ b/site/tests/ongoing_monitoring/ScoreBandsDrift.md
@@ -0,0 +1,48 @@
+# ScoreBandsDrift

Analyzes drift in population distribution and default rates across score bands.

### Purpose

The Score Bands Drift test is designed to evaluate changes in score-based risk segmentation
over time. By comparing population distribution and default rates across score bands between
reference and monitoring datasets, this test helps identify whether the model's risk
stratification remains stable in production. This is crucial for understanding if the model's
scoring behavior maintains its intended risk separation and whether specific score ranges
have experienced significant shifts.

### Test Mechanism

This test proceeds by segmenting scores into predefined bands and analyzing three key metrics
across these bands: population distribution, predicted default rates, and observed default
rates. For each band, it computes these metrics for both reference and monitoring datasets
and quantifies drift as percentage changes. The test provides both detailed band-by-band
comparisons and an overall stability assessment, with special attention to bands showing
significant drift.
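
As a rough sketch of the band-level comparison just described, the following pandas snippet computes population share and observed default rate per band and the percentage drift between datasets. The band edges, function name, and column handling are assumptions for illustration; they are not ValidMind defaults, and predicted default rates are omitted for brevity.

```python
import numpy as np
import pandas as pd


def score_band_drift(ref_scores, ref_defaults, mon_scores, mon_defaults,
                     band_edges=(300, 580, 670, 740, 800, 850)):
    """Compare population share and observed default rate per score band (illustrative sketch)."""
    bands = pd.IntervalIndex.from_breaks(list(band_edges))

    def summarise(scores, defaults):
        df = pd.DataFrame({"band": pd.cut(np.asarray(scores), bands),
                           "default": np.asarray(defaults)})
        grouped = df.groupby("band", observed=False)
        return pd.DataFrame({
            "population_pct": grouped.size() / len(df) * 100,
            "default_rate": grouped["default"].mean(),
        })

    ref = summarise(ref_scores, ref_defaults)
    mon = summarise(mon_scores, mon_defaults)
    out = ref.join(mon, lsuffix="_ref", rsuffix="_mon")

    # Drift expressed as percentage change relative to the reference dataset.
    for metric in ("population_pct", "default_rate"):
        out[f"{metric}_drift_pct"] = (
            (out[f"{metric}_mon"] - out[f"{metric}_ref"])
            / out[f"{metric}_ref"].replace(0, np.nan) * 100
        )
    return out
```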
+

### Signs of High Risk

- Large shifts in population distribution across bands
- Significant changes in default rates within bands
- Inconsistent drift patterns between adjacent bands
- Divergence between predicted and observed rates
- Systematic shifts in risk concentration
- Empty or sparse score bands in monitoring data

### Strengths

- Provides comprehensive view of score-based drift
- Identifies specific score ranges with instability
- Enables comparison of multiple risk metrics
- Includes both distribution and performance drift
- Supports business-relevant score segmentation
- Maintains interpretable drift thresholds

### Limitations

- Sensitive to choice of score band boundaries
- Requires sufficient samples in each band
- Cannot suggest optimal band adjustments
- May not capture within-band distribution changes
- Limited to predefined scoring metrics
- Complex interpretation with multiple drift signals
\ No newline at end of file
diff --git a/site/tests/ongoing_monitoring/ScorecardHistogramDrift.md b/site/tests/ongoing_monitoring/ScorecardHistogramDrift.md
new file mode 100644
index 0000000000..95522724d1
--- /dev/null
+++ b/site/tests/ongoing_monitoring/ScorecardHistogramDrift.md
@@ -0,0 +1,47 @@
+# ScorecardHistogramDrift

Compares score distributions between reference and monitoring datasets for each class.

### Purpose

The Scorecard Histogram Drift test is designed to evaluate changes in the model's scoring
patterns over time. By comparing score distributions between reference and monitoring datasets
for each class, this test helps identify whether the model's scoring behavior remains stable
in production. This is crucial for understanding if the model's risk assessment maintains
consistent patterns and whether specific score ranges have experienced significant shifts
in their distribution.

### Test Mechanism

This test proceeds by generating histograms of scores for each class in both reference and
monitoring datasets. It analyzes distribution characteristics through multiple statistical
moments: mean, variance, skewness, and kurtosis. The test quantifies drift as percentage
changes in these moments between datasets, providing both visual and numerical assessments
of distribution stability. Special attention is paid to class-specific distribution changes
(see the illustrative sketch at the end of this section).

### Signs of High Risk

- Significant shifts in score distribution shapes
- Large drifts in distribution moments exceeding the threshold
- Changes in the relative positioning of class distributions
- Appearance of new modes or peaks in monitoring data
- Unexpected changes in score spread or concentration
- Systematic shifts in class-specific scoring patterns

### Strengths

- Provides class-specific distribution analysis
- Identifies detailed changes in scoring patterns
- Enables visual comparison of distributions
- Includes comprehensive moment analysis
- Supports multiple class evaluation
- Maintains interpretable score scale

### Limitations

- Sensitive to binning choices in visualization
- Requires sufficient samples per class
- Cannot suggest score adjustments
- May not capture subtle distribution changes
- Complex interpretation with multiple classes
- Limited to univariate score analysis
\ No newline at end of file
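
To make the per-class visual comparison referenced in the test mechanism concrete, here is a minimal matplotlib sketch that overlays reference and monitoring score histograms for each class. The function name, bin count, and layout are assumptions for illustration and do not reflect the ValidMind implementation.

```python
import matplotlib.pyplot as plt
import numpy as np


def plot_scorecard_histograms(ref_scores, ref_labels, mon_scores, mon_labels, bins=30):
    """Overlay reference and monitoring score histograms, one panel per class (illustrative sketch)."""
    ref_scores, ref_labels = np.asarray(ref_scores), np.asarray(ref_labels)
    mon_scores, mon_labels = np.asarray(mon_scores), np.asarray(mon_labels)
    classes = np.unique(np.concatenate([ref_labels, mon_labels]))

    fig, axes = plt.subplots(1, len(classes), figsize=(6 * len(classes), 4), squeeze=False)
    for ax, cls in zip(axes[0], classes):
        # Density-normalised histograms so differently sized datasets remain comparable.
        ax.hist(ref_scores[ref_labels == cls], bins=bins, alpha=0.5,
                density=True, label="reference")
        ax.hist(mon_scores[mon_labels == cls], bins=bins, alpha=0.5,
                density=True, label="monitoring")
        ax.set_title(f"Class {cls}")
        ax.set_xlabel("score")
        ax.set_ylabel("density")
        ax.legend()

    fig.tight_layout()
    return fig
```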