From 73262527079d267e0bcba7c1f28ddd0c200a5f6a Mon Sep 17 00:00:00 2001 From: Juan Date: Wed, 8 Jan 2025 19:37:25 +0100 Subject: [PATCH 01/14] Update ongoing monitoring tests --- ...ication_scorecard_ongoing_monitoring.ipynb | 577 ++++++++++++++++++ .../tests/ongoing_monitoring/FeatureDrift.py | 241 ++++---- .../PredictionAcrossEachFeature.py | 41 +- .../PredictionCorrelation.py | 131 ++-- .../PredictionQuantilesAcrossFeatures.py | 97 +++ .../TargetPredictionDistributionPlot.py | 106 +++- 6 files changed, 990 insertions(+), 203 deletions(-) create mode 100644 notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb create mode 100644 validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py diff --git a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb new file mode 100644 index 000000000..c0ff3bbab --- /dev/null +++ b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ongoing Monitoring for Application Scorecard \n", + "\n", + "TBC." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#%pip install -q validmind\n", + "%pip install -q -e ../../../../developer-framework" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Binary classification`\n", + " - Use case: `Marketing/Sales - Attrition/Churn Management`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. 
Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"f3e49f241081145facbbf59e93bcd8a9\",\n", + " api_secret = \"c8dae73c5cc063cd070fa19508e625f60fe6dd18dddf96afed0d932ded91f530\",\n", + " model = \"cm5gljv9100021nignfpbkvvc\",\n", + " monitoring = True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize the Python environment\n", + "\n", + "Next, let's import the necessary libraries and set up your Python environment for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "\n", + "from validmind.tests import run_test\n", + "from validmind.datasets.credit_risk import lending_club\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preview the monitoring template\n", + "\n", + "A template predefines sections for your model monitoring documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "You will upload documentation and test results into this template later on. For now, take a look at the structure that the template provides with the `vm.preview_template()` function from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the reference and monitoring datasets\n", + "\n", + "The sample dataset used here is provided by the ValidMind library. For demonstration purposes we'll use the training, test dataset splits as `reference` and `monitoring` datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = lending_club.load_data(source=\"offline\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocess_df = lending_club.preprocess(df)\n", + "preprocess_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fe_df = lending_club.feature_engineering(preprocess_df)\n", + "fe_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the model\n", + "\n", + "In this section, we focus on constructing and refining our predictive model. \n", + "- We begin by dividing our data, which is based on Weight of Evidence (WoE) features, into training and testing sets (`train_df`, `test_df`). \n", + "- With `lending_club.split`, we employ a simple random split, randomly allocating data points to each set to ensure a mix of examples in both." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Split the data\n", + "train_df, test_df = lending_club.split(fe_df, test_size=0.2)\n", + "\n", + "x_train = train_df.drop(lending_club.target_column, axis=1)\n", + "y_train = train_df[lending_club.target_column]\n", + "\n", + "x_test = test_df.drop(lending_club.target_column, axis=1)\n", + "y_test = test_df[lending_club.target_column]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the XGBoost model\n", + "xgb_model = xgb.XGBClassifier(\n", + " n_estimators=50, \n", + " random_state=42, \n", + " early_stopping_rounds=10\n", + ")\n", + "xgb_model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "\n", + "# Fit the model\n", + "xgb_model.fit(\n", + " x_train, \n", + " y_train,\n", + " eval_set=[(x_test, y_test)],\n", + " verbose=False\n", + ")\n", + "\n", + "# Compute probabilities\n", + "train_xgb_prob = xgb_model.predict_proba(x_train)[:, 1]\n", + "test_xgb_prob = xgb_model.predict_proba(x_test)[:, 1]\n", + "\n", + "# Compute binary predictions\n", + "cut_off_threshold = 0.3\n", + "train_xgb_binary_predictions = (train_xgb_prob > cut_off_threshold).astype(int)\n", + "test_xgb_binary_predictions = (test_xgb_prob > cut_off_threshold).astype(int)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize the ValidMind datasets\n", + "\n", + "Before you can run tests, you must first initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module.\n", + "\n", + "This function takes a number of arguments:\n", + "\n", + "- `dataset` — The raw dataset that you want to provide as input to tests.\n", + "- `input_id` - A unique identifier that allows tracking what inputs are used when running each individual test.\n", + "- `target_column` — A required argument if tests require access to true values. 
This is the name of the target column in the dataset.\n", + "\n", + "With all datasets ready, you can now initialize training, reference(test) and monitor datasets (`reference_df` and `monitor_df`) created earlier into their own dataset objects using [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")\n", + "\n", + "vm_preprocess_dataset = vm.init_dataset(\n", + " dataset=preprocess_df,\n", + " input_id=\"preprocess_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")\n", + "\n", + "vm_fe_dataset = vm.init_dataset(\n", + " dataset=fe_df,\n", + " input_id=\"fe_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")\n", + "\n", + "vm_reference_ds = vm.init_dataset(\n", + " dataset=train_df,\n", + " input_id=\"reference_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")\n", + "\n", + "vm_monitoring_ds = vm.init_dataset(\n", + " dataset=test_df,\n", + " input_id=\"monitoring_dataset\",\n", + " target_column=lending_club.target_column,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize a model object\n", + "\n", + "You will also need to initialize a ValidMind model object (`vm_model`) that can be passed to other functions for analysis and tests on the data. You simply intialize this model object with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model):" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "vm_xgb_model = vm.init_model(\n", + " xgb_model,\n", + " input_id=\"xgb_model\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Assign prediction values and probabilities to the datasets\n", + "\n", + "With our model now trained, we'll move on to assigning both the predictive probabilities coming directly from the model's predictions, and the binary prediction after applying the cutoff threshold described in the previous steps. \n", + "- These tasks are achieved through the use of the `assign_predictions()` method associated with the VM `dataset` object.\n", + "- This method links the model's class prediction values and probabilities to our VM train and test datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "vm_reference_ds.assign_predictions(\n", + " model=vm_xgb_model,\n", + " prediction_values=train_xgb_binary_predictions,\n", + " prediction_probabilities=train_xgb_prob,\n", + ")\n", + "\n", + "vm_monitoring_ds.assign_predictions(\n", + " model=vm_xgb_model,\n", + " prediction_values=test_xgb_binary_predictions,\n", + " prediction_probabilities=test_xgb_prob,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compute credit risk scores\n", + "\n", + "In this phase, we translate model predictions into actionable scores using probability estimates generated by our trained model." 
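+    "\n",
+    "- The `lending_club.compute_scores` helper maps each predicted probability to a scorecard-style score. As an illustration only (the exact offset, factor and PDO constants inside `compute_scores` may differ), a typical points-to-double-the-odds scaling is `score = offset + factor * ln(odds)`, where `odds = (1 - p) / p` for probability of default `p` and `factor = PDO / ln(2)`.\n",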
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_xgb_scores = lending_club.compute_scores(train_xgb_prob)\n", + "test_xgb_scores = lending_club.compute_scores(test_xgb_prob)\n", + "\n", + "# Assign scores to the datasets\n", + "vm_reference_ds.add_extra_column(\"xgb_scores\", train_xgb_scores)\n", + "vm_monitoring_ds.add_extra_column(\"xgb_scores\", test_xgb_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adding custom context to the LLM descriptions\n", + "\n", + "To enable the LLM descriptions context, you need to set the `VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED` environment variable to `1`. This will enable the LLM descriptions context, which will be used to provide additional context to the LLM descriptions. This is a global setting that will affect all tests." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED\"] = \"1\"\n", + "\n", + "context = \"\"\"\n", + "FORMAT FOR THE LLM DESCRIPTIONS: \n", + " **** is designed to .\n", + "\n", + " The test operates by \n", + "\n", + " The primary advantages of this test include \n", + "\n", + " Users should be aware that \n", + "\n", + " **Key Insights:**\n", + "\n", + " The test results reveal:\n", + "\n", + " - ****: \n", + " - ****: \n", + " ...\n", + "\n", + " Based on these results, \n", + "\n", + "ADDITIONAL INSTRUCTIONS:\n", + " Present insights in order from general to specific, with each insight as a single bullet point with bold title.\n", + "\n", + " For each metric in the test results, include in the test overview:\n", + " - The metric's purpose and what it measures\n", + " - Its mathematical formula in LaTeX notation\n", + " - The range of possible values\n", + " - What constitutes good/bad performance\n", + " - How to interpret different values\n", + "\n", + " Each insight should progressively cover:\n", + " 1. Overall scope and distribution\n", + " 2. Complete breakdown of all elements with specific values\n", + " 3. Natural groupings and patterns\n", + " 4. Comparative analysis between datasets/categories\n", + " 5. Stability and variations\n", + " 6. Notable relationships or dependencies\n", + "\n", + " Remember:\n", + " - Keep all insights at the same level (no sub-bullets or nested structures)\n", + " - Make each insight complete and self-contained\n", + " - Include specific numerical values and ranges\n", + " - Cover all elements in the results comprehensively\n", + " - Maintain clear, concise language\n", + " - Use only \"- **Title**: Description\" format for insights\n", + " - Progress naturally from general to specific observations\n", + "\n", + "\"\"\".strip()\n", + "\n", + "os.environ[\"VALIDMIND_LLM_DESCRIPTIONS_CONTEXT\"] = context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conduct target and feature drift testing\n", + "\n", + "Next, the goal is to investigate the distributional characteristics of predictions and features to determine if the underlying data has changed. These tests are crucial for assessing the expected accuracy of the model.\n", + "\n", + "1. **Target drift:** We compare the dataset used for testing (reference data) with the monitoring data. This helps to identify any shifts in the target variable distribution.\n", + "2. **Feature drift:** We compare the training dataset with the monitoring data. 
Since features were used to train the model, any drift in these features could indicate potential issues, as the underlying patterns that the model was trained on may have changed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we can examine the correlation between features and predictions. Significant changes in these correlations may trigger a deeper assessment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run=True\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.TargetPredictionDistributionPlot\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5\n", + " },\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we want see difference in correlation pairs between model prediction and features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run=True\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.PredictionCorrelation\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5\n", + " },\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally for target drift, let's plot each prediction value and feature grid side by side." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run=True\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.PredictionQuantilesAcrossFeatures\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Feature drift tests\n", + "\n", + "Next, let's add run a test to investigate how or if the features have drifted. In this instance we want to compare the training data with prediction data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run=True\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.FeatureDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"psi_threshold\": 0.2,\n", + " },\n", + " )\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-eEL8LtKG-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/validmind/tests/ongoing_monitoring/FeatureDrift.py b/validmind/tests/ongoing_monitoring/FeatureDrift.py index 771e2a186..c864903e6 100644 --- a/validmind/tests/ongoing_monitoring/FeatureDrift.py +++ b/validmind/tests/ongoing_monitoring/FeatureDrift.py @@ -2,18 +2,99 @@ # See the LICENSE file in the root of this repository for details. 
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial - -import matplotlib.pyplot as plt import numpy as np import pandas as pd - +import plotly.graph_objects as go from validmind import tags, tasks +def calculate_psi_score(actual, expected): + """Calculate PSI score for a single bucket.""" + return (actual - expected) * np.log((actual + 1e-6) / (expected + 1e-6)) + + +def calculate_feature_distributions( + reference_data, monitoring_data, feature_columns, bins +): + """Calculate population distributions for each feature.""" + # Calculate quantiles from reference data + quantiles = reference_data[feature_columns].quantile( + bins, method="single", interpolation="nearest" + ) + + distributions = {} + for dataset_name, data in [ + ("reference", reference_data), + ("monitoring", monitoring_data), + ]: + for feature in feature_columns: + for bin_idx, threshold in enumerate(quantiles[feature]): + if bin_idx == 0: + mask = data[feature] < threshold + else: + prev_threshold = quantiles[feature][bins[bin_idx - 1]] + mask = (data[feature] >= prev_threshold) & ( + data[feature] < threshold + ) + + count = mask.sum() + proportion = count / len(data) + distributions[(dataset_name, feature, bins[bin_idx])] = proportion + + return distributions + + +def create_distribution_plot(feature_name, reference_dist, monitoring_dist, bins): + """Create population distribution plot for a feature.""" + fig = go.Figure() + + # Add reference distribution + fig.add_trace( + go.Bar( + x=list(range(len(bins))), + y=reference_dist, + name="Reference", + marker_color="blue", + marker_line_color="black", + marker_line_width=1, + opacity=0.75, + ) + ) + + # Add monitoring distribution + fig.add_trace( + go.Bar( + x=list(range(len(bins))), + y=monitoring_dist, + name="Monitoring", + marker_color="green", + marker_line_color="black", + marker_line_width=1, + opacity=0.75, + ) + ) + + fig.update_layout( + title=f"Population Distribution: {feature_name}", + xaxis_title="Bin", + yaxis_title="Population %", + barmode="group", + template="plotly_white", + showlegend=True, + width=800, + height=400, + ) + + return fig + + @tags("visualization") @tasks("monitoring") def FeatureDrift( - datasets, bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], feature_columns=None + datasets, + bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], + feature_columns=None, + psi_threshold=0.2, ): """ Evaluates changes in feature distribution over time to identify potential model drift. @@ -57,130 +138,48 @@ def FeatureDrift( - PSI score interpretation can be overly simplistic for complex datasets. 
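+
+    ### Implementation Notes
+
+    As implemented below, quantiles of the reference data define the bins, and the PSI contribution of
+    each bin is $(p_{monitoring} - p_{reference}) \cdot \ln\frac{p_{monitoring} + 10^{-6}}{p_{reference} + 10^{-6}}$,
+    summed across bins per feature. A feature passes when its total PSI is below `psi_threshold`
+    (default 0.2), and the test returns the PSI table, per-feature distribution plots and an overall
+    pass/fail flag.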
""" - # Feature columns for both datasets should be the same if not given - default_feature_columns = datasets[0].feature_columns - feature_columns = feature_columns or default_feature_columns + # Get feature columns + feature_columns = feature_columns or datasets[0].feature_columns - x_train_df = datasets[0].x_df() - x_test_df = datasets[1].x_df() + # Get data + reference_data = datasets[0].df + monitoring_data = datasets[1].df - quantiles_train = x_train_df[feature_columns].quantile( - bins, method="single", interpolation="nearest" - ) - PSI_QUANTILES = quantiles_train.to_dict() - - PSI_BUCKET_FRAC, col, n = get_psi_buckets( - x_test_df, x_train_df, feature_columns, bins, PSI_QUANTILES + # Calculate distributions + distributions = calculate_feature_distributions( + reference_data, monitoring_data, feature_columns, bins ) - def nest(d: dict) -> dict: - result = {} - for key, value in d.items(): - target = result - for k in key[:-1]: # traverse all keys but the last - target = target.setdefault(k, {}) - target[key[-1]] = value - return result - - PSI_BUCKET_FRAC = nest(PSI_BUCKET_FRAC) - - PSI_SCORES = {} - for col in feature_columns: + # Calculate PSI scores + psi_scores = {} + for feature in feature_columns: psi = 0 - for n in bins: - actual = PSI_BUCKET_FRAC["test"][col][n] - expected = PSI_BUCKET_FRAC["train"][col][n] - psi_of_bucket = (actual - expected) * np.log( - (actual + 1e-6) / (expected + 1e-6) - ) - psi += psi_of_bucket - PSI_SCORES[col] = psi - - psi_df = pd.DataFrame(list(PSI_SCORES.items()), columns=["Features", "PSI Score"]) + for bin_val in bins: + reference_prop = distributions[("reference", feature, bin_val)] + monitoring_prop = distributions[("monitoring", feature, bin_val)] + psi += calculate_psi_score(monitoring_prop, reference_prop) + psi_scores[feature] = psi + + # Create PSI score dataframe + psi_df = pd.DataFrame(list(psi_scores.items()), columns=["Feature", "PSI Score"]) + + # Add Pass/Fail column + psi_df["Pass/Fail"] = psi_df["PSI Score"].apply( + lambda x: "Pass" if x < psi_threshold else "Fail" + ) + # Sort by PSI Score psi_df.sort_values(by=["PSI Score"], inplace=True, ascending=False) - psi_table = [ - {"Features": values["Features"], "PSI Score": values["PSI Score"]} - for i, values in enumerate(psi_df.to_dict(orient="records")) - ] - - save_fig = plot_hist(PSI_BUCKET_FRAC, bins) - - final_psi = pd.DataFrame(psi_table) - - return (final_psi, *save_fig) - - -def get_psi_buckets(x_test_df, x_train_df, feature_columns, bins, PSI_QUANTILES): - DATA = {"test": x_test_df, "train": x_train_df} - PSI_BUCKET_FRAC = {} - for table in DATA.keys(): - total_count = DATA[table].shape[0] - for col in feature_columns: - count_sum = 0 - for n in bins: - if n == 0: - bucket_count = (DATA[table][col] < PSI_QUANTILES[col][n]).sum() - elif n < 9: - bucket_count = ( - total_count - - count_sum - - ((DATA[table][col] >= PSI_QUANTILES[col][n]).sum()) - ) - elif n == 9: - bucket_count = total_count - count_sum - count_sum += bucket_count - PSI_BUCKET_FRAC[table, col, n] = bucket_count / total_count - return PSI_BUCKET_FRAC, col, n - - -def plot_hist(PSI_BUCKET_FRAC, bins): - bin_table_psi = pd.DataFrame(PSI_BUCKET_FRAC) - save_fig = [] - for i in range(len(bin_table_psi)): + # Create distribution plots + figures = [] + for feature in feature_columns: + reference_dist = [distributions[("reference", feature, b)] for b in bins] + monitoring_dist = [distributions[("monitoring", feature, b)] for b in bins] + fig = create_distribution_plot(feature, reference_dist, monitoring_dist, 
bins) + figures.append(fig) - x = pd.DataFrame( - bin_table_psi.iloc[i]["test"].items(), - columns=["Bin", "Population % Reference"], - ) - y = pd.DataFrame( - bin_table_psi.iloc[i]["train"].items(), - columns=["Bin", "Population % Monitoring"], - ) - xy = x.merge(y, on="Bin") - xy.index = xy["Bin"] - xy = xy.drop(columns="Bin", axis=1) - feature_name = bin_table_psi.index[i] - - n = len(bins) - r = np.arange(n) - width = 0.25 - - fig = plt.figure() - - plt.bar( - r, - xy["Population % Reference"], - color="b", - width=width, - edgecolor="black", - label="Reference {0}".format(feature_name), - ) - plt.bar( - r + width, - xy["Population % Monitoring"], - color="g", - width=width, - edgecolor="black", - label="Monitoring {0}".format(feature_name), - ) + # Calculate overall pass/fail + pass_fail_bool = (psi_df["Pass/Fail"] == "Pass").all() - plt.xlabel("Bin") - plt.ylabel("Population %") - plt.title("Histogram of Population Differences {0}".format(feature_name)) - plt.legend() - plt.tight_layout() - plt.close() - save_fig.append(fig) - return save_fig + return ({"PSI Scores": psi_df}, *figures, pass_fail_bool) diff --git a/validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py b/validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py index 87f059e75..29c68b194 100644 --- a/validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +++ b/validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py @@ -53,30 +53,25 @@ def PredictionAcrossEachFeature(datasets, model): observed during the training of the model. """ - df_reference = datasets[0]._df - df_monitoring = datasets[1]._df + y_prob_reference = datasets[0].y_prob(model) + y_prob_monitoring = datasets[1].y_prob(model) figures_to_save = [] - for column in df_reference: - prediction_prob_column = f"{model.input_id}_probabilities" - prediction_column = f"{model.input_id}_prediction" - if column == prediction_prob_column or column == prediction_column: - pass - else: - fig, axs = plt.subplots(1, 2, figsize=(20, 10), sharey="row") - - ax1, ax2 = axs - - ax1.scatter(df_reference[column], df_reference[prediction_prob_column]) - ax2.scatter(df_monitoring[column], df_monitoring[prediction_prob_column]) - - ax1.set_title("Reference") - ax1.set_xlabel(column) - ax1.set_ylabel("Prediction Value") - - ax2.set_title("Monitoring") - ax2.set_xlabel(column) - figures_to_save.append(fig) - plt.close() + for column in datasets[0].feature_columns: + fig, axs = plt.subplots(1, 2, figsize=(20, 10), sharey="row") + + ax1, ax2 = axs + + ax1.scatter(datasets[0].df[column], y_prob_reference) + ax2.scatter(datasets[1].df[column], y_prob_monitoring) + + ax1.set_title("Reference") + ax1.set_xlabel(column) + ax1.set_ylabel("Prediction Value") + + ax2.set_title("Monitoring") + ax2.set_xlabel(column) + figures_to_save.append(fig) + plt.close() return tuple(figures_to_save) diff --git a/validmind/tests/ongoing_monitoring/PredictionCorrelation.py b/validmind/tests/ongoing_monitoring/PredictionCorrelation.py index 547425c33..f8f366ba3 100644 --- a/validmind/tests/ongoing_monitoring/PredictionCorrelation.py +++ b/validmind/tests/ongoing_monitoring/PredictionCorrelation.py @@ -2,16 +2,14 @@ # See the LICENSE file in the root of this repository for details. 
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial - -import matplotlib.pyplot as plt -import numpy as np - +import pandas as pd +import plotly.graph_objects as go from validmind import tags, tasks @tags("visualization") @tasks("monitoring") -def PredictionCorrelation(datasets, model): +def PredictionCorrelation(datasets, model, drift_pct_threshold=20): """ Assesses correlation changes between model predictions from reference and monitoring datasets to detect potential target drift. @@ -47,55 +45,98 @@ def PredictionCorrelation(datasets, model): - Focuses solely on linear relationships, potentially missing non-linear interactions. """ - prediction_prob_column = f"{model.input_id}_probabilities" - prediction_column = f"{model.input_id}_prediction" + # Get feature columns and predictions + feature_columns = datasets[0].feature_columns + y_prob_ref = pd.Series(datasets[0].y_prob(model), index=datasets[0].df.index) + y_prob_mon = pd.Series(datasets[1].y_prob(model), index=datasets[1].df.index) - df_corr = datasets[0]._df.corr() - df_corr = df_corr[[prediction_prob_column]] + # Create dataframes with features and predictions + df_ref = datasets[0].df[feature_columns].copy() + df_ref["predictions"] = y_prob_ref - df_corr2 = datasets[1]._df.corr() - df_corr2 = df_corr2[[prediction_prob_column]] + df_mon = datasets[1].df[feature_columns].copy() + df_mon["predictions"] = y_prob_mon - corr_final = df_corr.merge(df_corr2, left_index=True, right_index=True) - corr_final.columns = ["Reference Predictions", "Monitoring Predictions"] - corr_final = corr_final.drop(index=[prediction_column, prediction_prob_column]) + # Calculate correlations + corr_ref = df_ref.corr()["predictions"] + corr_mon = df_mon.corr()["predictions"] - n = len(corr_final) - r = np.arange(n) - width = 0.25 + # Combine correlations (excluding the predictions row) + corr_final = pd.DataFrame( + { + "Reference Predictions": corr_ref[feature_columns], + "Monitoring Predictions": corr_mon[feature_columns], + } + ) - fig = plt.figure() + # Calculate drift percentage with direction + corr_final["Drift (%)"] = ( + (corr_final["Monitoring Predictions"] - corr_final["Reference Predictions"]) + / corr_final["Reference Predictions"].abs() + * 100 + ).round(2) + + # Add Pass/Fail column based on absolute drift + corr_final["Pass/Fail"] = ( + corr_final["Drift (%)"] + .abs() + .apply(lambda x: "Pass" if x < drift_pct_threshold else "Fail") + ) - plt.bar( - r, - corr_final["Reference Predictions"], - color="b", - width=width, - edgecolor="black", - label="Reference Prediction Correlation", + # Create plotly figure + fig = go.Figure() + + # Add reference predictions bar + fig.add_trace( + go.Bar( + name="Reference Prediction Correlation", + x=corr_final.index, + y=corr_final["Reference Predictions"], + marker_color="blue", + marker_line_color="black", + marker_line_width=1, + opacity=0.75, + ) ) - plt.bar( - r + width, - corr_final["Monitoring Predictions"], - color="g", - width=width, - edgecolor="black", - label="Monitoring Prediction Correlation", + + # Add monitoring predictions bar + fig.add_trace( + go.Bar( + name="Monitoring Prediction Correlation", + x=corr_final.index, + y=corr_final["Monitoring Predictions"], + marker_color="green", + marker_line_color="black", + marker_line_width=1, + opacity=0.75, + ) ) - plt.xlabel("Features") - plt.ylabel("Correlation") - plt.title("Correlation between Predictions and Features") + # Update layout + fig.update_layout( + title="Correlation between Predictions and Features", + 
xaxis_title="Features", + yaxis_title="Correlation", + barmode="group", + template="plotly_white", + showlegend=True, + xaxis_tickangle=-45, + yaxis=dict( + range=[-1, 1], # Correlation range is always -1 to 1 + zeroline=True, + zerolinewidth=1, + zerolinecolor="grey", + gridcolor="lightgrey", + ), + hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"), + ) - features = corr_final.index.to_list() - plt.xticks(r + width / 2, features, rotation=45) - plt.legend() - plt.tight_layout() + # Ensure Features is the first column + corr_final["Feature"] = corr_final.index + cols = ["Feature"] + [col for col in corr_final.columns if col != "Feature"] + corr_final = corr_final[cols] - plt.close() + # Calculate overall pass/fail + pass_fail_bool = (corr_final["Pass/Fail"] == "Pass").all() - corr_final["Features"] = corr_final.index - corr_final = corr_final[ - ["Features", "Reference Predictions", "Monitoring Predictions"] - ] - return ({"Correlation Pair Table": corr_final}, fig) + return ({"Correlation Pair Table": corr_final}, fig, pass_fail_bool) diff --git a/validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py b/validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py new file mode 100644 index 000000000..ecd3dd926 --- /dev/null +++ b/validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py @@ -0,0 +1,97 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from validmind import tags, tasks + + +@tags("visualization") +@tasks("monitoring") +def PredictionQuantilesAcrossFeatures(datasets, model): + """ + Assesses differences in model prediction distributions across individual features between reference + and monitoring datasets through quantile analysis. + + ### Purpose + + This test aims to visualize how prediction distributions vary across feature values by showing + quantile information between reference and monitoring datasets. It helps identify significant + shifts in prediction patterns and potential areas of model instability. + + ### Test Mechanism + + The test generates box plots for each feature, comparing prediction probability distributions + between the reference and monitoring datasets. Each plot consists of two subplots showing the + quantile distribution of predictions: one for reference data and one for monitoring data. 
+ + ### Signs of High Risk + + - Significant differences in prediction distributions between reference and monitoring data + - Unexpected shifts in prediction quantiles across feature values + - Large changes in prediction variability between datasets + + ### Strengths + + - Provides clear visualization of prediction distribution changes + - Shows outliers and variability in predictions across features + - Enables quick identification of problematic feature ranges + + ### Limitations + + - May not capture complex relationships between features and predictions + - Quantile analysis may smooth over important individual predictions + - Requires careful interpretation of distribution changes + """ + + feature_columns = datasets[0].feature_columns + y_prob_reference = datasets[0].y_prob(model) + y_prob_monitoring = datasets[1].y_prob(model) + + figures_to_save = [] + for column in feature_columns: + # Create subplot + fig = make_subplots(1, 2, subplot_titles=("Reference", "Monitoring")) + + # Add reference box plot + fig.add_trace( + go.Box( + x=datasets[0].df[column], + y=y_prob_reference, + name="Reference", + boxpoints="outliers", + marker_color="blue", + ), + row=1, + col=1, + ) + + # Add monitoring box plot + fig.add_trace( + go.Box( + x=datasets[1].df[column], + y=y_prob_monitoring, + name="Monitoring", + boxpoints="outliers", + marker_color="red", + ), + row=1, + col=2, + ) + + # Update layout + fig.update_layout( + title=f"Prediction Distributions vs {column}", + showlegend=False, + width=800, + height=400, + ) + + # Update axes + fig.update_xaxes(title=column) + fig.update_yaxes(title="Prediction Value") + + figures_to_save.append(fig) + + return tuple(figures_to_save) diff --git a/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py b/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py index e57f9302d..f99bf97e2 100644 --- a/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +++ b/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py @@ -2,15 +2,17 @@ # See the LICENSE file in the root of this repository for details. # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial -import matplotlib.pyplot as plt -import seaborn as sns - +import plotly.graph_objects as go +import plotly.figure_factory as ff +import pandas as pd +import numpy as np +from scipy.stats import skew, kurtosis from validmind import tags, tasks @tags("visualization") @tasks("monitoring") -def TargetPredictionDistributionPlot(datasets, model): +def TargetPredictionDistributionPlot(datasets, model, drift_pct_threshold=20): """ Assesses differences in prediction distributions between a reference dataset and a monitoring dataset to identify potential data drift. @@ -45,23 +47,99 @@ def TargetPredictionDistributionPlot(datasets, model): - Less effective if the differences in distributions are subtle and not easily visible. 
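+
+    ### Implementation Notes
+
+    Alongside the overlaid density curves, the test builds a "Distribution Moments" table comparing the
+    Mean, Std, Skewness and Kurtosis of the reference and monitoring prediction distributions. For each
+    moment, $\text{Drift}(\%) = \frac{\text{Monitoring} - \text{Reference}}{|\text{Reference}|} \times 100$,
+    and the moment passes when the absolute drift is below `drift_pct_threshold` (default 20); the
+    overall result fails if any moment fails.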
""" + # Get predictions pred_ref = datasets[0].y_prob_df(model) pred_ref.columns = ["Reference Prediction"] pred_monitor = datasets[1].y_prob_df(model) pred_monitor.columns = ["Monitoring Prediction"] - fig = plt.figure() - plot = sns.kdeplot( - pred_ref["Reference Prediction"], fill=True, label="Reference Prediction" + # Calculate distribution moments + moments = pd.DataFrame( + { + "Statistic": ["Mean", "Std", "Skewness", "Kurtosis"], + "Reference": [ + pred_ref["Reference Prediction"].mean(), + pred_ref["Reference Prediction"].std(), + skew(pred_ref["Reference Prediction"]), + kurtosis(pred_ref["Reference Prediction"]), + ], + "Monitoring": [ + pred_monitor["Monitoring Prediction"].mean(), + pred_monitor["Monitoring Prediction"].std(), + skew(pred_monitor["Monitoring Prediction"]), + kurtosis(pred_monitor["Monitoring Prediction"]), + ], + } + ) + + # Calculate drift percentage with direction + moments["Drift (%)"] = ( + (moments["Monitoring"] - moments["Reference"]) + / moments["Reference"].abs() + * 100 + ).round(2) + + # Add Pass/Fail column based on absolute drift + moments["Pass/Fail"] = ( + moments["Drift (%)"] + .abs() + .apply(lambda x: "Pass" if x < drift_pct_threshold else "Fail") + ) + + # Set Statistic as index but keep it as a column + moments = moments.set_index("Statistic", drop=False) + + # Create KDE for both distributions + ref_kde = ff.create_distplot( + [pred_ref["Reference Prediction"].values], + ["Reference"], + show_hist=False, + show_rug=False, ) - plot = sns.kdeplot( - pred_monitor["Monitoring Prediction"], fill=True, label="Monitor Prediction" + monitor_kde = ff.create_distplot( + [pred_monitor["Monitoring Prediction"].values], + ["Monitoring"], + show_hist=False, + show_rug=False, ) - plot.set( - xlabel="Prediction", title="Distribution of Reference & Monitor Predictions" + + # Create new figure + fig = go.Figure() + + # Add reference distribution + fig.add_trace( + go.Scatter( + x=ref_kde.data[0].x, + y=ref_kde.data[0].y, + fill="tozeroy", + name="Reference Prediction", + line=dict(color="blue", width=2), + opacity=0.6, + ) + ) + + # Add monitoring distribution + fig.add_trace( + go.Scatter( + x=monitor_kde.data[0].x, + y=monitor_kde.data[0].y, + fill="tozeroy", + name="Monitor Prediction", + line=dict(color="red", width=2), + opacity=0.6, + ) + ) + + # Update layout + fig.update_layout( + title="Distribution of Reference & Monitor Predictions", + xaxis_title="Prediction", + yaxis_title="Density", + showlegend=True, + template="plotly_white", + hovermode="x unified", ) - plot.legend() - plt.close() + pass_fail_bool = (moments["Pass/Fail"] == "Pass").all() - return fig + return ({"Distribution Moments": moments}, fig, pass_fail_bool) From fdb6a02fbb013f3ad93a77f0e79ed4b96a1f455a Mon Sep 17 00:00:00 2001 From: Juan Date: Thu, 9 Jan 2025 20:26:41 +0100 Subject: [PATCH 02/14] Add ongoing monitoring metrics --- ...ication_scorecard_ongoing_monitoring.ipynb | 223 +++++++++++++++++- .../CalibrationCurveDrift.py | 188 +++++++++++++++ .../ClassDiscriminationDrift.py | 122 ++++++++++ .../ClassificationAccuracyDrift.py | 116 +++++++++ .../ConfusionMatrixDrift.py | 161 +++++++++++++ .../CumulativePredictionProbabilitiesDrift.py | 149 ++++++++++++ .../PredictionProbabilitiesDrift.py | 1 + .../PredictionProbabilitiesHistogramDrift.py | 175 ++++++++++++++ .../tests/ongoing_monitoring/ROCCurveDrift.py | 119 ++++++++++ .../ongoing_monitoring/ScoreBandsDrift.py | 183 ++++++++++++++ .../ScorecardHistogramDrift.py | 179 ++++++++++++++ 11 files changed, 1608 insertions(+), 8 
deletions(-) create mode 100644 validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py create mode 100644 validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py create mode 100644 validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py create mode 100644 validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py create mode 100644 validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py create mode 100644 validmind/tests/ongoing_monitoring/PredictionProbabilitiesDrift.py create mode 100644 validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py create mode 100644 validmind/tests/ongoing_monitoring/ROCCurveDrift.py create mode 100644 validmind/tests/ongoing_monitoring/ScoreBandsDrift.py create mode 100644 validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py diff --git a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index c0ff3bbab..3481b342a 100644 --- a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -450,11 +450,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "run=True\n", + "run=False\n", "if run:\n", "\n", " run_test(\n", @@ -478,11 +478,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "run=True\n", + "run=False\n", "if run:\n", "\n", " run_test(\n", @@ -506,11 +506,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "run=True\n", + "run=False\n", "if run:\n", "\n", " run_test(\n", @@ -533,11 +533,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "run=True\n", + "run=False\n", "if run:\n", "\n", " run_test(\n", @@ -551,6 +551,213 @@ " },\n", " )\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classification accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.ClassificationAccuracyDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.ConfusionMatrixDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.CalibrationCurveDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"n_bins\": 10,\n", + " \"drift_pct_threshold\": 10,\n", + " },\n", + " )" + ] + }, + { 
+ "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Class discrimination" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.ClassDiscriminationDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.ROCCurveDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " }\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.PredictionProbabilitiesHistogramDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 10,\n", + " },\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.CumulativePredictionProbabilitiesDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " }\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scoring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run=True\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.ScorecardHistogramDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"drift_pct_threshold\": 20,\n", + " },\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run=True\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.ScoreBandsDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " \"drift_pct_threshold\": 20,\n", + " },\n", + " )" + ] } ], "metadata": { diff --git a/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py b/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py new file mode 100644 index 000000000..42db8b95a --- /dev/null +++ b/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py @@ -0,0 +1,188 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. 
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from sklearn.calibration import calibration_curve +from typing import List +from validmind import tags, tasks +from validmind.errors import SkipTestError +from validmind.vm_models import VMDataset, VMModel + + +@tags( + "sklearn", + "binary_classification", + "model_performance", + "visualization", +) +@tasks("classification", "text_classification") +def CalibrationCurveDrift( + datasets: List[VMDataset], + model: VMModel, + n_bins: int = 10, + drift_pct_threshold: float = 20, +): + """ + Compares calibration curves between reference and monitoring datasets. + + ### Purpose + This test visualizes and quantifies differences in probability calibration between + reference and monitoring datasets to identify changes in model's probability estimates. + + ### Test Mechanism + Generates a plot with superimposed calibration curves and two tables comparing: + 1. Mean predicted probabilities per bin + 2. Actual fraction of positives per bin + + ### Signs of High Risk + - Large differences between calibration curves + - Systematic over/under-estimation in monitoring dataset + - Changes in calibration for specific probability ranges + """ + # Check for binary classification + if len(np.unique(datasets[0].y)) > 2: + raise SkipTestError( + "Calibration Curve Drift is only supported for binary classification models" + ) + + # Calculate calibration for reference dataset + y_prob_ref = datasets[0].y_prob(model) + y_true_ref = datasets[0].y.astype(y_prob_ref.dtype).flatten() + prob_true_ref, prob_pred_ref = calibration_curve( + y_true_ref, y_prob_ref, n_bins=n_bins, strategy="uniform" + ) + + # Calculate calibration for monitoring dataset + y_prob_mon = datasets[1].y_prob(model) + y_true_mon = datasets[1].y.astype(y_prob_mon.dtype).flatten() + prob_true_mon, prob_pred_mon = calibration_curve( + y_true_mon, y_prob_mon, n_bins=n_bins, strategy="uniform" + ) + + # Create bin labels + bin_edges = np.linspace(0, 1, n_bins + 1) + bin_labels = [f"{bin_edges[i]:.1f}-{bin_edges[i+1]:.1f}" for i in range(n_bins)] + + # Create predicted probabilities table + pred_metrics = [] + for i in range(n_bins): + ref_val = "no data" if i >= len(prob_pred_ref) else round(prob_pred_ref[i], 3) + mon_val = "no data" if i >= len(prob_pred_mon) else round(prob_pred_mon[i], 3) + + pred_metrics.append( + {"Bin": bin_labels[i], "Reference": ref_val, "Monitoring": mon_val} + ) + + pred_df = pd.DataFrame(pred_metrics) + + # Calculate drift only for bins with data + mask = (pred_df["Reference"] != "no data") & (pred_df["Monitoring"] != "no data") + pred_df["Drift (%)"] = None + pred_df.loc[mask, "Drift (%)"] = ( + ( + pd.to_numeric(pred_df.loc[mask, "Monitoring"]) + - pd.to_numeric(pred_df.loc[mask, "Reference"]) + ) + / pd.to_numeric(pred_df.loc[mask, "Reference"]).abs() + * 100 + ).round(2) + + pred_df["Pass/Fail"] = None + pred_df.loc[mask, "Pass/Fail"] = ( + pred_df.loc[mask, "Drift (%)"] + .abs() + .apply(lambda x: "Pass" if x < drift_pct_threshold else "Fail") + ) + pred_df.loc[~mask, "Pass/Fail"] = "N/A" + + # Create fraction of positives table + true_metrics = [] + for i in range(n_bins): + ref_val = "no data" if i >= len(prob_true_ref) else round(prob_true_ref[i], 3) + mon_val = "no data" if i >= len(prob_true_mon) else round(prob_true_mon[i], 3) + + true_metrics.append( + {"Bin": bin_labels[i], "Reference": ref_val, "Monitoring": mon_val} + ) + + true_df = pd.DataFrame(true_metrics) + + # 
Calculate drift only for bins with data + mask = (true_df["Reference"] != "no data") & (true_df["Monitoring"] != "no data") + true_df["Drift (%)"] = None + true_df.loc[mask, "Drift (%)"] = ( + ( + pd.to_numeric(true_df.loc[mask, "Monitoring"]) + - pd.to_numeric(true_df.loc[mask, "Reference"]) + ) + / pd.to_numeric(true_df.loc[mask, "Reference"]).abs() + * 100 + ).round(2) + + true_df["Pass/Fail"] = None + true_df.loc[mask, "Pass/Fail"] = ( + true_df.loc[mask, "Drift (%)"] + .abs() + .apply(lambda x: "Pass" if x < drift_pct_threshold else "Fail") + ) + true_df.loc[~mask, "Pass/Fail"] = "N/A" + + # Create figure + fig = go.Figure() + + # Add perfect calibration line + fig.add_trace( + go.Scatter( + x=[0, 1], + y=[0, 1], + mode="lines", + name="Perfect Calibration", + line=dict(color="grey", dash="dash"), + ) + ) + + # Add reference calibration curve + fig.add_trace( + go.Scatter( + x=prob_pred_ref, + y=prob_true_ref, + mode="lines+markers", + name="Reference", + line=dict(color="blue", width=2), + marker=dict(size=8), + ) + ) + + # Add monitoring calibration curve + fig.add_trace( + go.Scatter( + x=prob_pred_mon, + y=prob_true_mon, + mode="lines+markers", + name="Monitoring", + line=dict(color="red", width=2), + marker=dict(size=8), + ) + ) + + fig.update_layout( + title="Calibration Curves Comparison", + xaxis=dict(title="Mean Predicted Probability", range=[0, 1]), + yaxis=dict(title="Fraction of Positives", range=[0, 1]), + width=700, + height=500, + ) + + # Calculate overall pass/fail (only for bins with data) + pass_fail_bool = (pred_df.loc[mask, "Pass/Fail"] == "Pass").all() and ( + true_df.loc[mask, "Pass/Fail"] == "Pass" + ).all() + + return ( + fig, + {"Mean Predicted Probabilities": pred_df, "Fraction of Positives": true_df}, + pass_fail_bool, + ) diff --git a/validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py b/validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py new file mode 100644 index 000000000..169f47ac8 --- /dev/null +++ b/validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py @@ -0,0 +1,122 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +from sklearn.metrics import roc_auc_score +from sklearn.preprocessing import LabelBinarizer +from scipy import stats +from typing import List +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +def multiclass_roc_auc_score(y_test, y_pred, average="macro"): + lb = LabelBinarizer() + lb.fit(y_test) + return roc_auc_score(lb.transform(y_test), lb.transform(y_pred), average=average) + + +def calculate_gini(y_true, y_prob): + """Calculate Gini coefficient (2*AUC - 1)""" + return 2 * roc_auc_score(y_true, y_prob) - 1 + + +def calculate_ks_statistic(y_true, y_prob): + """Calculate Kolmogorov-Smirnov statistic""" + pos_scores = y_prob[y_true == 1] + neg_scores = y_prob[y_true == 0] + return stats.ks_2samp(pos_scores, neg_scores).statistic + + +@tags( + "sklearn", "binary_classification", "multiclass_classification", "model_performance" +) +@tasks("classification", "text_classification") +def ClassDiscriminationDrift( + datasets: List[VMDataset], model: VMModel, drift_pct_threshold=20 +): + """ + Compares classification discrimination metrics between reference and monitoring datasets. 
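+    For binary classification the compared metrics are ROC AUC, GINI ($2 \times \text{AUC} - 1$) and the
+    Kolmogorov-Smirnov statistic between predicted probabilities of the positive and negative classes;
+    for multiclass models only a macro-averaged ROC AUC (computed from label predictions) is reported.
+    Each metric is checked against `drift_pct_threshold` using the same relative drift calculation as
+    the other drift tests in this suite.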
+ + ### Purpose + This test evaluates drift in discrimination metrics including ROC AUC, GINI, and KS statistics. + + ### Test Mechanism + Calculates discrimination metrics for both reference and monitoring datasets and + compares them to identify significant changes in model's discriminative power. + + ### Signs of High Risk + - Large drifts in discrimination metrics (above threshold) + - Significant drops in ROC AUC or GINI coefficient + - Reduced class separation as indicated by KS statistic + """ + # Get predictions and true values + y_true_ref = datasets[0].y + y_true_mon = datasets[1].y + + metrics = [] + + # Handle binary vs multiclass + if len(np.unique(y_true_ref)) == 2: + # Binary classification + y_prob_ref = datasets[0].y_prob(model) + y_prob_mon = datasets[1].y_prob(model) + + # ROC AUC + roc_auc_ref = roc_auc_score(y_true_ref, y_prob_ref) + roc_auc_mon = roc_auc_score(y_true_mon, y_prob_mon) + metrics.append( + {"Metric": "ROC_AUC", "Reference": roc_auc_ref, "Monitoring": roc_auc_mon} + ) + + # GINI + gini_ref = calculate_gini(y_true_ref, y_prob_ref) + gini_mon = calculate_gini(y_true_mon, y_prob_mon) + metrics.append( + {"Metric": "GINI", "Reference": gini_ref, "Monitoring": gini_mon} + ) + + # KS Statistic + ks_ref = calculate_ks_statistic(y_true_ref, y_prob_ref) + ks_mon = calculate_ks_statistic(y_true_mon, y_prob_mon) + metrics.append( + {"Metric": "KS_Statistic", "Reference": ks_ref, "Monitoring": ks_mon} + ) + + else: + # Multiclass + y_pred_ref = datasets[0].y_pred(model) + y_pred_mon = datasets[1].y_pred(model) + + # Only ROC AUC for multiclass + roc_auc_ref = multiclass_roc_auc_score(y_true_ref, y_pred_ref) + roc_auc_mon = multiclass_roc_auc_score(y_true_mon, y_pred_mon) + metrics.append( + { + "Metric": "ROC_AUC_Macro", + "Reference": roc_auc_ref, + "Monitoring": roc_auc_mon, + } + ) + + # Create DataFrame + df = pd.DataFrame(metrics) + + # Calculate drift percentage with direction + df["Drift (%)"] = ( + (df["Monitoring"] - df["Reference"]) / df["Reference"].abs() * 100 + ).round(2) + + # Add Pass/Fail column based on absolute drift + df["Pass/Fail"] = ( + df["Drift (%)"] + .abs() + .apply(lambda x: "Pass" if x < drift_pct_threshold else "Fail") + ) + + # Calculate overall pass/fail + pass_fail_bool = (df["Pass/Fail"] == "Pass").all() + + return ({"Classification Discrimination Metrics": df}, pass_fail_bool) diff --git a/validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py b/validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py new file mode 100644 index 000000000..e07106a4c --- /dev/null +++ b/validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py @@ -0,0 +1,116 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +from sklearn.metrics import classification_report +from typing import List +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +@tags( + "sklearn", "binary_classification", "multiclass_classification", "model_performance" +) +@tasks("classification", "text_classification") +def ClassificationAccuracyDrift( + datasets: List[VMDataset], model: VMModel, drift_pct_threshold=20 +): + """ + Compares classification accuracy metrics between reference and monitoring datasets. 
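+    Metrics are taken from scikit-learn's `classification_report`: overall accuracy, per-label precision,
+    recall and F1 score, and their macro averages, each compared between the reference and monitoring
+    datasets against `drift_pct_threshold`.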
+ + ### Purpose + This test evaluates drift in classification accuracy metrics including per-label and + macro-averaged precision, recall, and F1 scores. + + ### Test Mechanism + Calculates classification metrics for both reference and monitoring datasets and + compares them to identify significant changes in model performance. + + ### Signs of High Risk + - Large drifts in accuracy metrics (above threshold) + - Inconsistent changes across different labels + - Significant drops in macro-averaged metrics + """ + # Get predictions and true values + y_true_ref = datasets[0].y + y_pred_ref = datasets[0].y_pred(model) + + y_true_mon = datasets[1].y + y_pred_mon = datasets[1].y_pred(model) + + # Get unique labels from reference dataset + labels = np.unique(y_true_ref) + labels = sorted(labels.tolist()) + + # Calculate classification reports + report_ref = classification_report( + y_true=y_true_ref, + y_pred=y_pred_ref, + output_dict=True, + zero_division=0, + ) + + report_mon = classification_report( + y_true=y_true_mon, + y_pred=y_pred_mon, + output_dict=True, + zero_division=0, + ) + + # Create metrics dataframe + metrics = [] + + # Add accuracy + metrics.append( + { + "Metric": "Accuracy", + "Reference": report_ref["accuracy"], + "Monitoring": report_mon["accuracy"], + } + ) + + # Add per-label metrics + for label in labels: + label_str = str(label) + for metric in ["precision", "recall", "f1-score"]: + metric_name = f"{metric.title()}_{label_str}" + metrics.append( + { + "Metric": metric_name, + "Reference": report_ref[label_str][metric], + "Monitoring": report_mon[label_str][metric], + } + ) + + # Add macro averages + for metric in ["precision", "recall", "f1-score"]: + metric_name = f"{metric.title()}_Macro" + metrics.append( + { + "Metric": metric_name, + "Reference": report_ref["macro avg"][metric], + "Monitoring": report_mon["macro avg"][metric], + } + ) + + # Create DataFrame + df = pd.DataFrame(metrics) + + # Calculate drift percentage with direction + df["Drift (%)"] = ( + (df["Monitoring"] - df["Reference"]) / df["Reference"].abs() * 100 + ).round(2) + + # Add Pass/Fail column based on absolute drift + df["Pass/Fail"] = ( + df["Drift (%)"] + .abs() + .apply(lambda x: "Pass" if x < drift_pct_threshold else "Fail") + ) + + # Calculate overall pass/fail + pass_fail_bool = (df["Pass/Fail"] == "Pass").all() + + return ({"Classification Accuracy Metrics": df}, pass_fail_bool) diff --git a/validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py b/validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py new file mode 100644 index 000000000..8edf0a5fe --- /dev/null +++ b/validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py @@ -0,0 +1,161 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +from sklearn.metrics import confusion_matrix +from typing import List +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +@tags( + "sklearn", "binary_classification", "multiclass_classification", "model_performance" +) +@tasks("classification", "text_classification") +def ConfusionMatrixDrift( + datasets: List[VMDataset], model: VMModel, drift_pct_threshold=20 +): + """ + Compares confusion matrix metrics between reference and monitoring datasets. 
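+    Confusion-matrix cells are reported as percentages of each dataset's total
+    observations, so reference and monitoring samples of different sizes remain
+    directly comparable; a separate sample-counts table records the underlying totals.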
+ + ### Purpose + This test evaluates drift in confusion matrix elements including True Positives, + True Negatives, False Positives, and False Negatives. + + ### Test Mechanism + Calculates confusion matrices for both reference and monitoring datasets and + compares corresponding elements to identify significant changes in model predictions. + + ### Signs of High Risk + - Large drifts in confusion matrix elements (above threshold) + - Significant changes in error patterns (FP, FN) + - Inconsistent changes across different classes + """ + # Get predictions and true values for reference dataset + y_pred_ref = datasets[0].y_pred(model) + y_true_ref = datasets[0].y.astype(y_pred_ref.dtype) + + # Get predictions and true values for monitoring dataset + y_pred_mon = datasets[1].y_pred(model) + y_true_mon = datasets[1].y.astype(y_pred_mon.dtype) + + # Get unique labels from reference dataset + labels = np.unique(y_true_ref) + labels = sorted(labels.tolist()) + + # Calculate confusion matrices + cm_ref = confusion_matrix(y_true_ref, y_pred_ref, labels=labels) + cm_mon = confusion_matrix(y_true_mon, y_pred_mon, labels=labels) + + # Get total counts + total_ref = len(y_true_ref) + total_mon = len(y_true_mon) + + # Create sample counts table + counts_data = { + "Dataset": ["Reference", "Monitoring"], + "Total": [total_ref, total_mon], + } + + # Add per-class counts + for label in labels: + label_str = f"Class_{label}" + counts_data[label_str] = [ + np.sum(y_true_ref == label), + np.sum(y_true_mon == label), + ] + + counts_df = pd.DataFrame(counts_data) + + # Create confusion matrix metrics + metrics = [] + + if len(labels) == 2: + # Binary classification + tn_ref, fp_ref, fn_ref, tp_ref = cm_ref.ravel() + tn_mon, fp_mon, fn_mon, tp_mon = cm_mon.ravel() + + confusion_elements = [ + ("True Negatives (%)", tn_ref / total_ref * 100, tn_mon / total_mon * 100), + ("False Positives (%)", fp_ref / total_ref * 100, fp_mon / total_mon * 100), + ("False Negatives (%)", fn_ref / total_ref * 100, fn_mon / total_mon * 100), + ("True Positives (%)", tp_ref / total_ref * 100, tp_mon / total_mon * 100), + ] + + for name, ref_val, mon_val in confusion_elements: + metrics.append( + { + "Metric": name, + "Reference": round(ref_val, 2), + "Monitoring": round(mon_val, 2), + } + ) + + else: + # Multiclass - calculate per-class metrics + for i, label in enumerate(labels): + # True Positives for this class + tp_ref = cm_ref[i, i] + tp_mon = cm_mon[i, i] + + # False Positives (sum of column minus TP) + fp_ref = cm_ref[:, i].sum() - tp_ref + fp_mon = cm_mon[:, i].sum() - tp_mon + + # False Negatives (sum of row minus TP) + fn_ref = cm_ref[i, :].sum() - tp_ref + fn_mon = cm_mon[i, :].sum() - tp_mon + + class_metrics = [ + ( + f"True Positives_{label} (%)", + tp_ref / total_ref * 100, + tp_mon / total_mon * 100, + ), + ( + f"False Positives_{label} (%)", + fp_ref / total_ref * 100, + fp_mon / total_mon * 100, + ), + ( + f"False Negatives_{label} (%)", + fn_ref / total_ref * 100, + fn_mon / total_mon * 100, + ), + ] + + for name, ref_val, mon_val in class_metrics: + metrics.append( + { + "Metric": name, + "Reference": round(ref_val, 2), + "Monitoring": round(mon_val, 2), + } + ) + + # Create metrics DataFrame + metrics_df = pd.DataFrame(metrics) + + # Calculate drift percentage with direction + metrics_df["Drift (%)"] = ( + (metrics_df["Monitoring"] - metrics_df["Reference"]) + / metrics_df["Reference"].abs() + * 100 + ).round(2) + + # Add Pass/Fail column based on absolute drift + metrics_df["Pass/Fail"] = ( + metrics_df["Drift 
(%)"] + .abs() + .apply(lambda x: "Pass" if x < drift_pct_threshold else "Fail") + ) + + # Calculate overall pass/fail + pass_fail_bool = (metrics_df["Pass/Fail"] == "Pass").all() + + return ( + {"Confusion Matrix Metrics": metrics_df, "Sample Counts": counts_df}, + pass_fail_bool, + ) diff --git a/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py b/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py new file mode 100644 index 000000000..6d05b7624 --- /dev/null +++ b/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py @@ -0,0 +1,149 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from typing import List, Tuple +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +@tags("visualization", "credit_risk") +@tasks("classification") +def CumulativePredictionProbabilitiesDrift( + datasets: List[VMDataset], + model: VMModel, +): + """ + Compares cumulative prediction probability distributions between reference and + monitoring datasets for each class. + + ### Purpose + This test visualizes changes in the model's cumulative probability predictions between + reference and monitoring datasets by comparing their distributions for each class. + + ### Test Mechanism + For each class, creates a figure with two subplots: + 1. Cumulative distributions comparison + 2. Difference between monitoring and reference distributions + + ### Signs of High Risk + - Significant shifts in cumulative distributions + - Large differences between reference and monitoring curves + - Systematic differences across probability ranges + """ + # Get predictions and true values + y_true_ref = datasets[0].y + y_prob_ref = datasets[0].y_prob(model) + df_ref = datasets[0].df.copy() + df_ref["probabilities"] = y_prob_ref + + y_true_mon = datasets[1].y + y_prob_mon = datasets[1].y_prob(model) + df_mon = datasets[1].df.copy() + df_mon["probabilities"] = y_prob_mon + + # Get unique classes + classes = sorted(df_ref[datasets[0].target_column].unique()) + + # Define colors + ref_color = "rgba(31, 119, 180, 0.8)" # Blue with 0.8 opacity + mon_color = "rgba(255, 127, 14, 0.8)" # Orange with 0.8 opacity + diff_color = "rgba(148, 103, 189, 0.8)" # Purple with 0.8 opacity + + figures = [] + for class_value in classes: + # Create figure with secondary y-axis + fig = make_subplots( + rows=2, + cols=1, + subplot_titles=[ + f"Cumulative Distributions - Class {class_value}", + "Difference (Monitoring - Reference)", + ], + vertical_spacing=0.15, + shared_xaxes=True, + ) + + # Get probabilities for current class + ref_probs = df_ref[df_ref[datasets[0].target_column] == class_value][ + "probabilities" + ] + mon_probs = df_mon[df_mon[datasets[1].target_column] == class_value][ + "probabilities" + ] + + # Calculate cumulative distributions + ref_sorted = np.sort(ref_probs) + ref_cumsum = np.arange(len(ref_sorted)) / float(len(ref_sorted)) + + mon_sorted = np.sort(mon_probs) + mon_cumsum = np.arange(len(mon_sorted)) / float(len(mon_sorted)) + + # Reference dataset cumulative curve + fig.add_trace( + go.Scatter( + x=ref_sorted, + y=ref_cumsum, + mode="lines", + name="Reference", + line=dict(color=ref_color, width=2), + ), + row=1, + col=1, + ) + + # Monitoring dataset cumulative curve + fig.add_trace( 
+ go.Scatter( + x=mon_sorted, + y=mon_cumsum, + mode="lines", + name="Monitoring", + line=dict(color=mon_color, width=2), + ), + row=1, + col=1, + ) + + # Calculate and plot difference + # Interpolate monitoring values to match reference x-points + mon_interp = np.interp(ref_sorted, mon_sorted, mon_cumsum) + difference = mon_interp - ref_cumsum + + fig.add_trace( + go.Scatter( + x=ref_sorted, + y=difference, + mode="lines", + name="Difference", + line=dict(color=diff_color, width=2), + ), + row=2, + col=1, + ) + + # Add horizontal line at y=0 for difference plot + fig.add_hline(y=0, line=dict(color="grey", dash="dash"), row=2, col=1) + + # Update layout + fig.update_layout( + height=600, + width=800, + showlegend=True, + legend=dict(yanchor="middle", y=0.9, xanchor="left", x=1.05), + ) + + # Update axes + fig.update_xaxes(title_text="Probability", range=[0, 1], row=2, col=1) + fig.update_xaxes(range=[0, 1], row=1, col=1) + fig.update_yaxes( + title_text="Cumulative Distribution", range=[0, 1], row=1, col=1 + ) + fig.update_yaxes(title_text="Difference", row=2, col=1) + + figures.append(fig) + + return tuple(figures) diff --git a/validmind/tests/ongoing_monitoring/PredictionProbabilitiesDrift.py b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesDrift.py new file mode 100644 index 000000000..0519ecba6 --- /dev/null +++ b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesDrift.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py new file mode 100644 index 000000000..b975526fe --- /dev/null +++ b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py @@ -0,0 +1,175 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from scipy import stats +from typing import List +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +@tags("visualization", "credit_risk") +@tasks("classification") +def PredictionProbabilitiesHistogramDrift( + datasets: List[VMDataset], + model: VMModel, + title="Prediction Probabilities Histogram Drift", + drift_pct_threshold: float = 20.0, +): + """ + Compares prediction probability distributions between reference and monitoring datasets + for each class. + + ### Purpose + This test visualizes and quantifies changes in the model's probability predictions + between reference and monitoring datasets by comparing their distributions for each class. 
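+    Each distribution moment receives its own Pass/Fail verdict against
+    `drift_pct_threshold`; a reference moment of exactly zero yields an infinite
+    drift percentage and therefore always fails.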
+ + ### Test Mechanism + - Creates histograms of prediction probabilities for each class + - Superimposes reference and monitoring distributions + - Computes distribution moments and their drift + - Uses separate subplots for each class for clear comparison + + ### Signs of High Risk + - Significant shifts in probability distributions + - Changes in the shape of distributions + - New modes or peaks appearing in monitoring data + - Large differences in distribution moments + """ + # Get predictions and true values + y_prob_ref = datasets[0].y_prob(model) + df_ref = datasets[0].df.copy() + df_ref["probabilities"] = y_prob_ref + + y_prob_mon = datasets[1].y_prob(model) + df_mon = datasets[1].df.copy() + df_mon["probabilities"] = y_prob_mon + + # Get unique classes + classes = sorted(df_ref[datasets[0].target_column].unique()) + + # Create subplots with more horizontal space for legends + fig = make_subplots( + rows=len(classes), + cols=1, + subplot_titles=[f"Class {cls}" for cls in classes], + horizontal_spacing=0.15, + ) + + # Define colors + ref_color = "rgba(31, 119, 180, 0.8)" # Blue with 0.8 opacity + mon_color = "rgba(255, 127, 14, 0.8)" # Orange with 0.8 opacity + + # Dictionary to store tables for each class + tables = {} + all_passed = True # Track overall pass/fail + + # Add histograms and create tables for each class + for i, class_value in enumerate(classes, start=1): + # Get probabilities for current class + ref_probs = df_ref[df_ref[datasets[0].target_column] == class_value][ + "probabilities" + ] + mon_probs = df_mon[df_mon[datasets[1].target_column] == class_value][ + "probabilities" + ] + + # Calculate distribution moments + ref_stats = { + "Mean": np.mean(ref_probs), + "Variance": np.var(ref_probs), + "Skewness": stats.skew(ref_probs), + "Kurtosis": stats.kurtosis(ref_probs), + } + + mon_stats = { + "Mean": np.mean(mon_probs), + "Variance": np.var(mon_probs), + "Skewness": stats.skew(mon_probs), + "Kurtosis": stats.kurtosis(mon_probs), + } + + # Create table for this class + table_data = [] + class_passed = True # Track pass/fail for this class + + for stat_name in ["Mean", "Variance", "Skewness", "Kurtosis"]: + ref_val = ref_stats[stat_name] + mon_val = mon_stats[stat_name] + drift = ( + ((mon_val - ref_val) / abs(ref_val)) * 100 if ref_val != 0 else np.inf + ) + passed = abs(drift) < drift_pct_threshold + class_passed &= passed # Update class pass/fail + + table_data.append( + { + "Statistic": stat_name, + "Reference": round(ref_val, 4), + "Monitoring": round(mon_val, 4), + "Drift (%)": round(drift, 2), + "Pass/Fail": "Pass" if passed else "Fail", + } + ) + + tables[f"Class {class_value}"] = pd.DataFrame(table_data) + all_passed &= class_passed # Update overall pass/fail + + # Reference dataset histogram + fig.add_trace( + go.Histogram( + x=ref_probs, + name=f"Reference - Class {class_value}", + marker_color=ref_color, + showlegend=True, + legendrank=i * 2 - 1, + ), + row=i, + col=1, + ) + + # Monitoring dataset histogram + fig.add_trace( + go.Histogram( + x=mon_probs, + name=f"Monitoring - Class {class_value}", + marker_color=mon_color, + showlegend=True, + legendrank=i * 2, + ), + row=i, + col=1, + ) + + # Update layout + fig.update_layout( + title_text=title, + barmode="overlay", + height=300 * len(classes), + width=1000, + showlegend=True, + ) + + # Update axes labels and add separate legends for each subplot + for i in range(len(classes)): + fig.update_xaxes(title_text="Probability", row=i + 1, col=1) + fig.update_yaxes(title_text="Frequency", row=i + 1, col=1) + + # 
Add separate legend for each subplot + fig.update_layout( + **{ + f'legend{i+1 if i > 0 else ""}': dict( + yanchor="middle", + y=1 - (i / len(classes)) - (0.5 / len(classes)), + xanchor="left", + x=1.05, + tracegroupgap=5, + ) + } + ) + + return fig, tables, all_passed diff --git a/validmind/tests/ongoing_monitoring/ROCCurveDrift.py b/validmind/tests/ongoing_monitoring/ROCCurveDrift.py new file mode 100644 index 000000000..c8d29459e --- /dev/null +++ b/validmind/tests/ongoing_monitoring/ROCCurveDrift.py @@ -0,0 +1,119 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import plotly.graph_objects as go +from sklearn.metrics import roc_auc_score, roc_curve +from validmind import tags, tasks +from validmind.errors import SkipTestError +from validmind.vm_models import VMDataset, VMModel + +from typing import List + + +@tags( + "sklearn", + "binary_classification", + "model_performance", + "visualization", +) +@tasks("classification", "text_classification") +def ROCCurveDrift(datasets: List[VMDataset], model: VMModel): + """ + Compares ROC curves between reference and monitoring datasets. + + ### Purpose + This test visualizes the differences in ROC curves and AUC scores between reference + and monitoring datasets to identify changes in model's discriminative ability. + + ### Test Mechanism + Generates two plots: + 1. Superimposed ROC curves for both datasets + 2. Difference between ROC curves (Monitoring - Reference) + + ### Signs of High Risk + - Large differences between ROC curves + - Significant drop in AUC score for monitoring dataset + - Systematic differences in specific FPR regions + """ + # Check for binary classification + if len(np.unique(datasets[0].y)) > 2: + raise SkipTestError( + "ROC Curve Drift is only supported for binary classification models" + ) + + # Calculate ROC curves for reference dataset + y_prob_ref = datasets[0].y_prob(model) + y_true_ref = datasets[0].y.astype(y_prob_ref.dtype).flatten() + fpr_ref, tpr_ref, _ = roc_curve(y_true_ref, y_prob_ref, drop_intermediate=False) + auc_ref = roc_auc_score(y_true_ref, y_prob_ref) + + # Calculate ROC curves for monitoring dataset + y_prob_mon = datasets[1].y_prob(model) + y_true_mon = datasets[1].y.astype(y_prob_mon.dtype).flatten() + fpr_mon, tpr_mon, _ = roc_curve(y_true_mon, y_prob_mon, drop_intermediate=False) + auc_mon = roc_auc_score(y_true_mon, y_prob_mon) + + # Create superimposed ROC curves plot + fig1 = go.Figure() + + fig1.add_trace( + go.Scatter( + x=fpr_ref, + y=tpr_ref, + mode="lines", + name=f"Reference (AUC = {auc_ref:.3f})", + line=dict(color="blue", width=2), + ) + ) + + fig1.add_trace( + go.Scatter( + x=fpr_mon, + y=tpr_mon, + mode="lines", + name=f"Monitoring (AUC = {auc_mon:.3f})", + line=dict(color="red", width=2), + ) + ) + + fig1.update_layout( + title="ROC Curves Comparison", + xaxis=dict(title="False Positive Rate"), + yaxis=dict(title="True Positive Rate"), + width=700, + height=500, + ) + + # Interpolate monitoring TPR to match reference FPR points + tpr_mon_interp = np.interp(fpr_ref, fpr_mon, tpr_mon) + + # Calculate TPR difference + tpr_diff = tpr_mon_interp - tpr_ref + + # Create difference plot + fig2 = go.Figure() + + fig2.add_trace( + go.Scatter( + x=fpr_ref, + y=tpr_diff, + mode="lines", + name="TPR Difference", + line=dict(color="purple", width=2), + ) + ) + + # Add horizontal line at y=0 + fig2.add_hline(y=0, 
line=dict(color="grey", dash="dash"), name="No Difference") + + fig2.update_layout( + title="ROC Curve Difference (Monitoring - Reference)", + xaxis=dict(title="False Positive Rate"), + yaxis=dict(title="TPR Difference"), + width=700, + height=500, + ) + + return fig1, fig2 diff --git a/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py b/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py new file mode 100644 index 000000000..e4c9b4ad5 --- /dev/null +++ b/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py @@ -0,0 +1,183 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import pandas as pd +import numpy as np +from typing import List, Dict, Tuple +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +@tags("visualization", "credit_risk", "scorecard") +@tasks("classification") +def ScoreBandsDrift( + datasets: List[VMDataset], + model: VMModel, + score_column: str = "score", + score_bands: list = None, + drift_threshold: float = 20.0, +): + """ + Analyzes drift in population distribution and default rates across score bands between + reference and monitoring datasets. + + ### Purpose + This test evaluates changes in score band metrics between reference and monitoring + datasets to identify potential drift in model behavior. + + ### Test Mechanism + Compares three key metrics across score bands: + 1. Population distribution (%) + 2. Predicted default rates (%) + 3. Observed default rates (%) + Calculates drift percentages and flags significant changes. + + ### Signs of High Risk + - Large shifts in population distribution across bands + - Significant changes in default rates + - Inconsistent drift patterns across bands + - Multiple metrics showing high drift simultaneously + """ + # Validate score column + if score_column not in datasets[0].df.columns: + raise ValueError( + f"Score column '{score_column}' not found in reference dataset" + ) + if score_column not in datasets[1].df.columns: + raise ValueError( + f"Score column '{score_column}' not found in monitoring dataset" + ) + + # Default score bands if none provided + if score_bands is None: + score_bands = [410, 440, 470] + + # Create band labels + band_labels = [ + f"{score_bands[i]}-{score_bands[i+1]}" for i in range(len(score_bands) - 1) + ] + band_labels.insert(0, f"<{score_bands[0]}") + band_labels.append(f">{score_bands[-1]}") + + # Process reference and monitoring datasets + def process_dataset(dataset, model): + df = dataset.df.copy() + df["score_band"] = pd.cut( + df[score_column], + bins=[-np.inf] + score_bands + [np.inf], + labels=band_labels, + ) + y_pred = dataset.y_pred(model) + + results = {} + total_population = len(df) + + # Store min and max scores + min_score = df[score_column].min() + max_score = df[score_column].max() + + for band in band_labels: + band_mask = df["score_band"] == band + population = band_mask.sum() + + results[band] = { + "Population (%)": population / total_population * 100, + "Predicted Default Rate (%)": ( + y_pred[band_mask].sum() / population * 100 if population > 0 else 0 + ), + "Observed Default Rate (%)": ( + df[band_mask][dataset.target_column].sum() / population * 100 + if population > 0 + else 0 + ), + } + + results["min_score"] = min_score + results["max_score"] = max_score + return results + + # Get metrics for both datasets + ref_results = process_dataset(datasets[0], model) + mon_results = 
process_dataset(datasets[1], model) + + # Create the three comparison tables + tables = {} + all_passed = True + + metrics = [ + ("Population Distribution (%)", "Population (%)"), + ("Predicted Default Rates (%)", "Predicted Default Rate (%)"), + ("Observed Default Rates (%)", "Observed Default Rate (%)"), + ] + + for table_name, metric in metrics: + rows = [] + metric_passed = True + + for band in band_labels: + ref_val = ref_results[band][metric] + mon_val = mon_results[band][metric] + + # Calculate drift - using absolute difference when reference is 0 + drift = ( + abs(mon_val - ref_val) + if ref_val == 0 + else ((mon_val - ref_val) / abs(ref_val)) * 100 + ) + passed = abs(drift) < drift_threshold + metric_passed &= passed + + rows.append( + { + "Score Band": band, + "Reference": round(ref_val, 4), + "Monitoring": round(mon_val, 4), + "Drift (%)": round(drift, 2), + "Pass/Fail": "Pass" if passed else "Fail", + } + ) + + # Add total row for all metrics + if metric == "Population (%)": + ref_total = 100.0 + mon_total = 100.0 + drift_total = 0.0 + passed_total = True + else: + ref_total = sum( + ref_results[band][metric] * (ref_results[band]["Population (%)"] / 100) + for band in band_labels + ) + mon_total = sum( + mon_results[band][metric] * (mon_results[band]["Population (%)"] / 100) + for band in band_labels + ) + # Apply same drift calculation to totals + drift_total = ( + abs(mon_total - ref_total) + if ref_total == 0 + else ((mon_total - ref_total) / abs(ref_total)) * 100 + ) + passed_total = abs(drift_total) < drift_threshold + + # Format total row with score ranges + total_label = ( + f"Total ({ref_results['min_score']:.0f}-{ref_results['max_score']:.0f})" + ) + + rows.append( + { + "Score Band": total_label, + "Reference": round(ref_total, 4), + "Monitoring": round(mon_total, 4), + "Drift (%)": round(drift_total, 2), + "Pass/Fail": "Pass" if passed_total else "Fail", + } + ) + + metric_passed &= passed_total + tables[table_name] = pd.DataFrame(rows) + all_passed &= metric_passed + + return tables, all_passed diff --git a/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py b/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py new file mode 100644 index 000000000..939a69d12 --- /dev/null +++ b/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py @@ -0,0 +1,179 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from scipy import stats +from typing import List +from validmind import tags, tasks +from validmind.vm_models import VMDataset + + +@tags("visualization", "credit_risk", "logistic_regression") +@tasks("classification") +def ScorecardHistogramDrift( + datasets: List[VMDataset], + score_column: str = "score", + title: str = "Scorecard Histogram Drift", + drift_pct_threshold: float = 20.0, +): + """ + Compares score distributions between reference and monitoring datasets for each class. + + ### Purpose + This test visualizes and quantifies changes in the model's scoring between reference + and monitoring datasets by comparing their distributions for each class. 
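+    Both datasets must already contain the configured `score_column`; the test
+    raises an error rather than computing scores on the fly.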
+ + ### Test Mechanism + - Creates histograms of scores for each class + - Superimposes reference and monitoring distributions + - Computes distribution moments and their drift + - Uses separate subplots for each class for clear comparison + + ### Signs of High Risk + - Significant shifts in score distributions + - Changes in the shape of distributions + - New modes or peaks appearing in monitoring data + - Large differences in distribution moments + """ + # Verify score column exists + if score_column not in datasets[0].df.columns: + raise ValueError( + f"Score column '{score_column}' not found in reference dataset" + ) + if score_column not in datasets[1].df.columns: + raise ValueError( + f"Score column '{score_column}' not found in monitoring dataset" + ) + + # Get reference and monitoring data + df_ref = datasets[0].df + df_mon = datasets[1].df + + # Get unique classes + classes = sorted(df_ref[datasets[0].target_column].unique()) + + # Create subplots with more horizontal space for legends + fig = make_subplots( + rows=len(classes), + cols=1, + subplot_titles=[f"Class {cls}" for cls in classes], + horizontal_spacing=0.15, + ) + + # Define colors + ref_color = "rgba(31, 119, 180, 0.8)" # Blue with 0.8 opacity + mon_color = "rgba(255, 127, 14, 0.8)" # Orange with 0.8 opacity + + # Dictionary to store tables for each class + tables = {} + all_passed = True # Track overall pass/fail + + # Add histograms and create tables for each class + for i, class_value in enumerate(classes, start=1): + # Get scores for current class + ref_scores = df_ref[df_ref[datasets[0].target_column] == class_value][ + score_column + ] + mon_scores = df_mon[df_mon[datasets[1].target_column] == class_value][ + score_column + ] + + # Calculate distribution moments + ref_stats = { + "Mean": np.mean(ref_scores), + "Variance": np.var(ref_scores), + "Skewness": stats.skew(ref_scores), + "Kurtosis": stats.kurtosis(ref_scores), + } + + mon_stats = { + "Mean": np.mean(mon_scores), + "Variance": np.var(mon_scores), + "Skewness": stats.skew(mon_scores), + "Kurtosis": stats.kurtosis(mon_scores), + } + + # Create table for this class + table_data = [] + class_passed = True # Track pass/fail for this class + + for stat_name in ["Mean", "Variance", "Skewness", "Kurtosis"]: + ref_val = ref_stats[stat_name] + mon_val = mon_stats[stat_name] + drift = ( + ((mon_val - ref_val) / abs(ref_val)) * 100 if ref_val != 0 else np.inf + ) + passed = abs(drift) < drift_pct_threshold + class_passed &= passed # Update class pass/fail + + table_data.append( + { + "Statistic": stat_name, + "Reference": round(ref_val, 4), + "Monitoring": round(mon_val, 4), + "Drift (%)": round(drift, 2), + "Pass/Fail": "Pass" if passed else "Fail", + } + ) + + tables[f"Class {class_value}"] = pd.DataFrame(table_data) + all_passed &= class_passed # Update overall pass/fail + + # Reference dataset histogram + fig.add_trace( + go.Histogram( + x=ref_scores, + name=f"Reference - Class {class_value}", + marker_color=ref_color, + showlegend=True, + legendrank=i * 2 - 1, + ), + row=i, + col=1, + ) + + # Monitoring dataset histogram + fig.add_trace( + go.Histogram( + x=mon_scores, + name=f"Monitoring - Class {class_value}", + marker_color=mon_color, + showlegend=True, + legendrank=i * 2, + ), + row=i, + col=1, + ) + + # Update layout + fig.update_layout( + title_text=title, + barmode="overlay", + height=300 * len(classes), + width=1000, + showlegend=True, + ) + + # Update axes labels and add separate legends for each subplot + for i in range(len(classes)): + 
fig.update_xaxes(title_text="Score", row=i + 1, col=1) + fig.update_yaxes(title_text="Frequency", row=i + 1, col=1) + + # Add separate legend for each subplot + fig.update_layout( + **{ + f'legend{i+1 if i > 0 else ""}': dict( + yanchor="middle", + y=1 - (i / len(classes)) - (0.5 / len(classes)), + xanchor="left", + x=1.05, + tracegroupgap=5, + ) + } + ) + + return fig, tables, all_passed From 7076ab54d4498cb93ed9d909ad7054215440a33d Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 13:04:38 +0100 Subject: [PATCH 03/14] Update ongoing monitoring notebook --- .../application_scorecard_with_ml.ipynb | 8 +- ...ication_scorecard_ongoing_monitoring.ipynb | 483 +++++++++++++++--- .../data_validation/HighPearsonCorrelation.py | 14 +- .../ongoing_monitoring/ClassImbalanceDrift.py | 117 +++++ 4 files changed, 557 insertions(+), 65 deletions(-) create mode 100644 validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py diff --git a/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb b/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb index 03a6180b8..e94859df7 100644 --- a/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb +++ b/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb @@ -145,9 +145,9 @@ "\n", "vm.init(\n", " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", - " api_key = \"...\",\n", - " api_secret = \"...\",\n", - " model = \"...\"\n", + " api_key = \"f3e49f241081145facbbf59e93bcd8a9\",\n", + " api_secret = \"c8dae73c5cc063cd070fa19508e625f60fe6dd18dddf96afed0d932ded91f530\",\n", + " model = \"cm5fdpjre0lhq29iapn0uuwiw\"\n", ")" ] }, @@ -606,7 +606,7 @@ "\n", " For each metric in the test results, include in the test overview:\n", " - The metric's purpose and what it measures\n", - " - Its mathematical formula in LaTeX notation\n", + " - Its mathematical formula\n", " - The range of possible values\n", " - What constitutes good/bad performance\n", " - How to interpret different values\n", diff --git a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index 3481b342a..4940ee53c 100644 --- a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -6,7 +6,65 @@ "source": [ "# Ongoing Monitoring for Application Scorecard \n", "\n", - "TBC." + "In this notebook, you'll learn how to seamlessly monitor your production models using the ValidMind Platform.\n", + "\n", + "We'll walk you through the process of initializing the ValidMind Library, loading a sample dataset and model, and running a monitoring test suite to quickly generate documentation about your new data and model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation, validation, monitoring tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you explore the available resources for developers at some point. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "

\n", + "Signing up is FREE — Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Model monitoring documentation**: A comprehensive and structured record of a production model, including key elements such as data sources, inputs, performance metrics, and periodic evaluations. This documentation ensures transparency and visibility of the model's performance in the production environment.\n", + "\n", + "**Monitoring documentation template**: Similar to documentation template, The monitoring documentation template functions as a test suite and lays out the structure of model monitoring documentation, segmented into various sections and sub-sections. Monitoring documentation templates define the structure of your model monitoring documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures." 
] }, { @@ -24,8 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "#%pip install -q validmind\n", - "%pip install -q -e ../../../../developer-framework" + "%pip install -q validmind" ] }, { @@ -75,9 +132,9 @@ "\n", "vm.init(\n", " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", - " api_key = \"f3e49f241081145facbbf59e93bcd8a9\",\n", - " api_secret = \"c8dae73c5cc063cd070fa19508e625f60fe6dd18dddf96afed0d932ded91f530\",\n", - " model = \"cm5gljv9100021nignfpbkvvc\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\",\n", " monitoring = True\n", ")" ] @@ -99,8 +156,14 @@ "source": [ "import xgboost as xgb\n", "\n", + "from datetime import datetime, timedelta\n", + "\n", "from validmind.tests import run_test\n", "from validmind.datasets.credit_risk import lending_club\n", + "from validmind.unit_metrics import list_metrics\n", + "from validmind.unit_metrics import describe_metric\n", + "from validmind.unit_metrics import run_metric\n", + "from validmind.api_client import log_metric\n", "\n", "%matplotlib inline" ] @@ -248,24 +311,6 @@ "metadata": {}, "outputs": [], "source": [ - "vm_raw_dataset = vm.init_dataset(\n", - " dataset=df,\n", - " input_id=\"raw_dataset\",\n", - " target_column=lending_club.target_column,\n", - ")\n", - "\n", - "vm_preprocess_dataset = vm.init_dataset(\n", - " dataset=preprocess_df,\n", - " input_id=\"preprocess_dataset\",\n", - " target_column=lending_club.target_column,\n", - ")\n", - "\n", - "vm_fe_dataset = vm.init_dataset(\n", - " dataset=fe_df,\n", - " input_id=\"fe_dataset\",\n", - " target_column=lending_club.target_column,\n", - ")\n", - "\n", "vm_reference_ds = vm.init_dataset(\n", " dataset=train_df,\n", " input_id=\"reference_dataset\",\n", @@ -402,7 +447,7 @@ "\n", " For each metric in the test results, include in the test overview:\n", " - The metric's purpose and what it measures\n", - " - Its mathematical formula in LaTeX notation\n", + " - Its mathematical formula\n", " - The range of possible values\n", " - What constitutes good/bad performance\n", " - How to interpret different values\n", @@ -433,7 +478,151 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Conduct target and feature drift testing\n", + "### Monitoring data description\n", + "\n", + "The Monitoring Data Description tests aim to provide a comprehensive statistical analysis of the monitoring dataset's characteristics. These tests examine the basic statistical properties, identify any missing data patterns, assess data uniqueness, visualize numerical feature distributions, and evaluate feature relationships through correlation analysis.\n", + "\n", + "The primary objective is to establish a baseline understanding of the monitoring data's structure and quality, enabling the detection of any significant deviations from expected patterns that could impact model performance. Each test is designed to capture different aspects of the data, from univariate statistics to multivariate relationships, providing a foundation for ongoing data quality assessment in the production environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.data_validation.MissingValues:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.data_validation.UniqueRows:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " ).log()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.data_validation.PearsonCorrelationMatrix:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run=True\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.data_validation.HighPearsonCorrelation:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"feature_columns\": vm_monitoring_ds.feature_columns,\n", + " \"max_threshold\": 0.5,\n", + " \"top_n_correlations\": 10\n", + " }\n", + " ).log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.ongoing_monitoring.ClassImbalanceDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 1\n", + " },\n", + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Target and feature drift\n", "\n", "Next, the goal is to investigate the distributional characteristics of predictions and features to determine if the underlying data has changed. 
These tests are crucial for assessing the expected accuracy of the model.\n", "\n", @@ -450,7 +639,25 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "run=False\n", + "if run:\n", + "\n", + " run_test(\n", + " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " ).log()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -466,7 +673,7 @@ " params={\n", " \"drift_pct_threshold\": 5\n", " },\n", - " )" + " ).log()" ] }, { @@ -478,7 +685,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -494,7 +701,7 @@ " params={\n", " \"drift_pct_threshold\": 5\n", " },\n", - " )" + " ).log()" ] }, { @@ -506,7 +713,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -519,21 +726,19 @@ " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", " \"model\": vm_xgb_model,\n", " },\n", - " )" + " ).log()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Feature drift tests\n", - "\n", "Next, let's add run a test to investigate how or if the features have drifted. In this instance we want to compare the training data with prediction data." ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -549,19 +754,23 @@ " params={\n", " \"psi_threshold\": 0.2,\n", " },\n", - " )\n" + " ).log()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Classification accuracy" + "### Classification accuracy\n", + "\n", + "We now evaluate the model's predictive performance by comparing its behavior between reference and monitoring datasets. These tests analyze shifts in overall accuracy metrics, examine changes in the confusion matrix to identify specific classification pattern changes, and assess the model's probability calibration across different prediction thresholds. \n", + "\n", + "The primary objective is to detect any degradation in the model's classification performance that might indicate reliability issues in production. The tests provide both aggregate performance metrics and detailed breakdowns of prediction patterns, enabling the identification of specific areas where the model's accuracy might be deteriorating." ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -577,12 +786,12 @@ " params={\n", " \"drift_pct_threshold\": 5,\n", " },\n", - " )\n" + " ).log()" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -598,12 +807,12 @@ " params={\n", " \"drift_pct_threshold\": 5,\n", " },\n", - " )" + " ).log()" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -620,19 +829,23 @@ " \"n_bins\": 10,\n", " \"drift_pct_threshold\": 10,\n", " },\n", - " )" + " ).log()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Class discrimination" + "### Class discrimination\n", + "\n", + "The following tests assess the model's ability to effectively separate different classes in both reference and monitoring datasets. 
These tests analyze the model's discriminative power by examining the separation between class distributions, evaluating changes in the ROC curve characteristics, comparing probability distribution patterns, and assessing cumulative prediction trends. \n", + "\n", + "The primary objective is to identify any deterioration in the model's ability to distinguish between classes, which could indicate a decline in model effectiveness. The tests examine both the overall discriminative capability and the granular patterns in prediction distributions, providing insights into whether the model maintains its ability to effectively differentiate between classes in the production environment." ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -648,12 +861,12 @@ " params={\n", " \"drift_pct_threshold\": 5,\n", " },\n", - " )" + " ).log()" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -666,12 +879,12 @@ " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", " \"model\": vm_xgb_model,\n", " }\n", - " )" + " ).log()" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -687,12 +900,12 @@ " params={\n", " \"drift_pct_threshold\": 10,\n", " },\n", - " )" + " ).log()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -705,23 +918,27 @@ " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", " \"model\": vm_xgb_model,\n", " }\n", - " )" + " ).log()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Scoring" + "### Scoring\n", + "\n", + "Next we analyze the distribution and stability of credit scores across reference and monitoring datasets. These tests evaluate shifts in score distributions, examine changes in score band populations, and assess the relationship between scores and default rates. \n", + "\n", + "The primary objective is to identify any significant changes in how the model assigns credit scores, which could indicate drift in risk assessment capabilities. The tests examine both the overall score distribution patterns and the specific performance within defined score bands, providing insights into whether the model maintains consistent and reliable risk segmentation." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "run=True\n", + "run=False\n", "if run:\n", "\n", " run_test(\n", @@ -733,16 +950,16 @@ " \"score_column\": \"xgb_scores\",\n", " \"drift_pct_threshold\": 20,\n", " },\n", - " )\n" + " ).log()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "run=True\n", + "run=False\n", "if run:\n", "\n", " run_test(\n", @@ -756,7 +973,155 @@ " \"score_bands\": [500, 540, 570],\n", " \"drift_pct_threshold\": 20,\n", " },\n", - " )" + " ).log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance history\n", + "\n", + "In this section we showcase how to track and visualize the temporal evolution of key model performance metrics, including AUC, F1 score, precision, recall, and accuracy. For demonstration purposes, the section simulates historical performance data by introducing a gradual downward trend and random noise to these metrics over a specified time period. 
These tests are useful for analyzing the stability and trends in model performance indicators, helping to identify potential degradation or unexpected fluctuations in model behavior over time. \n", + "\n", + "The main goal is to maintain a continuous record of model performance that can be used to detect gradual drift, sudden changes, or cyclical patterns in model effectiveness. This temporal monitoring approach provides early warning signals of potential issues and helps establish whether the model maintains consistent performance within acceptable boundaries throughout its deployment period." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = [metric for metric in list_metrics() if \"classification\" in metric]\n", + "\n", + "for metric_id in metrics:\n", + " describe_metric(metric_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.ROC_AUC\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "auc = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Accuracy\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "accuracy = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_metric(\n", + " \"validmind.unit_metrics.classification.Recall\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "recall = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f1 = run_metric(\n", + " \"validmind.unit_metrics.classification.F1\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "f1 = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "precision = run_metric(\n", + " \"validmind.unit_metrics.classification.Precision\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ")\n", + "precision = result.metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "NUM_DAYS = 10\n", + "base_date = datetime.now() - timedelta(days=NUM_DAYS)\n", + "\n", + "# Initial values\n", + "performance_metrics = {\n", + " \"AUC Score\": auc,\n", + " \"F1 Score\": f1,\n", + " \"Precision Score\": precision,\n", + " \"Recall Score\": recall,\n", + " \"Accuracy Score\": accuracy\n", + "}\n", + "\n", + "# Trend parameters\n", + "trend_factor = 0.98 # Slight downward trend (multiply by 0.98 each step)\n", + "noise_scale = 0.02 # Random fluctuation of ±2%\n", + "\n", + "\n", + "for i in range(NUM_DAYS):\n", + " recorded_at = base_date + timedelta(days=i)\n", + " print(f\"\\nrecorded_at: {recorded_at}\")\n", + "\n", + " # Log each metric with trend and noise\n", + " for metric_name, base_value in performance_metrics.items():\n", + " # Apply trend and add random noise\n", + " trend = base_value * (trend_factor ** i)\n", + " noise = 
np.random.normal(0, noise_scale * base_value)\n", + " value = max(0, min(1, trend + noise)) # Ensure value stays between 0 and 1\n", + " \n", + " log_metric(\n", + " key=metric_name,\n", + " value=value,\n", + " recorded_at=recorded_at.isoformat()\n", + " )\n", + " \n", + " print(f\"{metric_name:<15}: {value:.4f}\")\n" ] } ], diff --git a/validmind/tests/data_validation/HighPearsonCorrelation.py b/validmind/tests/data_validation/HighPearsonCorrelation.py index 5be563185..9169a58ee 100644 --- a/validmind/tests/data_validation/HighPearsonCorrelation.py +++ b/validmind/tests/data_validation/HighPearsonCorrelation.py @@ -9,7 +9,10 @@ @tags("tabular_data", "data_quality", "correlation") @tasks("classification", "regression") def HighPearsonCorrelation( - dataset: VMDataset, max_threshold: float = 0.3, top_n_correlations: int = 10 + dataset: VMDataset, + max_threshold: float = 0.3, + top_n_correlations: int = 10, + feature_columns: list = None, ): """ Identifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity. @@ -51,8 +54,15 @@ def HighPearsonCorrelation( - Limited to identifying redundancy only within feature pairs; may fail to spot more complex relationships among three or more variables. """ + + # Select features + if feature_columns is None: + df = dataset.df + else: + df = dataset.df[feature_columns] + # Get correlation matrix for numeric columns - corr = dataset.df.corr(numeric_only=True) + corr = df.corr(numeric_only=True) # Create table of correlation coefficients and column pairs pairs = [] diff --git a/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py b/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py new file mode 100644 index 000000000..aa604f611 --- /dev/null +++ b/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py @@ -0,0 +1,117 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import pandas as pd +import plotly.graph_objs as go +from typing import List, Tuple +from validmind import tags, tasks +from validmind.vm_models import VMDataset +from validmind.errors import SkipTestError + + +@tags("tabular_data", "binary_classification", "multiclass_classification") +@tasks("classification") +def ClassImbalanceDrift( + datasets: List[VMDataset], + drift_pct_threshold: float = 5.0, + title: str = "Class Distribution Drift", +) -> Tuple[go.Figure, dict, bool]: + """ + Evaluates drift in class distribution between reference and monitoring datasets. + + ### Purpose + This test compares the class distribution between two datasets to identify + potential population drift in the target variable. 
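+    Drift is the signed difference in percentage points between the monitoring and
+    reference class shares (not a relative change), and targets with more than 10
+    classes are skipped rather than evaluated.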
+ + ### Test Mechanism + - Calculates class percentages for both datasets + - Computes drift as the difference in percentages + - Visualizes distributions side by side + - Flags significant changes in class proportions + + ### Signs of High Risk + - Large shifts in class proportions + - New classes appearing or existing classes disappearing + - Multiple classes showing significant drift + - Systematic shifts across multiple classes + """ + # Validate inputs + if not datasets[0].target_column or not datasets[1].target_column: + raise SkipTestError("No target column provided") + + # Calculate class distributions + ref_dist = ( + datasets[0].df[datasets[0].target_column].value_counts(normalize=True) * 100 + ) + mon_dist = ( + datasets[1].df[datasets[1].target_column].value_counts(normalize=True) * 100 + ) + + # Get all unique classes + all_classes = sorted(set(ref_dist.index) | set(mon_dist.index)) + + if len(all_classes) > 10: + raise SkipTestError("Skipping target column with more than 10 classes") + + # Create comparison table + rows = [] + all_passed = True + + for class_label in all_classes: + ref_percent = ref_dist.get(class_label, 0) + mon_percent = mon_dist.get(class_label, 0) + + # Calculate drift (preserving sign) + drift = mon_percent - ref_percent + passed = abs(drift) < drift_pct_threshold + all_passed &= passed + + rows.append( + { + datasets[0].target_column: class_label, + "Reference (%)": round(ref_percent, 4), + "Monitoring (%)": round(mon_percent, 4), + "Drift (%)": round(drift, 4), + "Pass/Fail": "Pass" if passed else "Fail", + } + ) + + comparison_df = pd.DataFrame(rows) + + # Create named tables dictionary + tables = {"Class Distribution (%)": comparison_df} + + # Create visualization + fig = go.Figure() + + # Add reference distribution bar + fig.add_trace( + go.Bar( + name="Reference", + x=[str(c) for c in all_classes], + y=comparison_df["Reference (%)"], + marker_color="rgba(31, 119, 180, 0.8)", # Blue with 0.8 opacity + ) + ) + + # Add monitoring distribution bar + fig.add_trace( + go.Bar( + name="Monitoring", + x=[str(c) for c in all_classes], + y=comparison_df["Monitoring (%)"], + marker_color="rgba(255, 127, 14, 0.8)", # Orange with 0.8 opacity + ) + ) + + # Update layout + fig.update_layout( + title=title, + xaxis_title="Class", + yaxis_title="Percentage (%)", + barmode="group", + showlegend=True, + ) + + return fig, tables, all_passed From 942e45358933189e40f0d14e2ab0b4aef70ff830 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 13:07:24 +0100 Subject: [PATCH 04/14] Fix lint --- .../CumulativePredictionProbabilitiesDrift.py | 4 +--- .../tests/ongoing_monitoring/PredictionProbabilitiesDrift.py | 1 - validmind/tests/ongoing_monitoring/ScoreBandsDrift.py | 2 +- .../ongoing_monitoring/TargetPredictionDistributionPlot.py | 1 - 4 files changed, 2 insertions(+), 6 deletions(-) delete mode 100644 validmind/tests/ongoing_monitoring/PredictionProbabilitiesDrift.py diff --git a/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py b/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py index 6d05b7624..6d6c228f8 100644 --- a/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +++ b/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py @@ -5,7 +5,7 @@ import numpy as np import plotly.graph_objects as go from plotly.subplots import make_subplots -from typing import List, Tuple +from typing import List from validmind import tags, tasks from validmind.vm_models import 
VMDataset, VMModel @@ -35,12 +35,10 @@ def CumulativePredictionProbabilitiesDrift( - Systematic differences across probability ranges """ # Get predictions and true values - y_true_ref = datasets[0].y y_prob_ref = datasets[0].y_prob(model) df_ref = datasets[0].df.copy() df_ref["probabilities"] = y_prob_ref - y_true_mon = datasets[1].y y_prob_mon = datasets[1].y_prob(model) df_mon = datasets[1].df.copy() df_mon["probabilities"] = y_prob_mon diff --git a/validmind/tests/ongoing_monitoring/PredictionProbabilitiesDrift.py b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesDrift.py deleted file mode 100644 index 0519ecba6..000000000 --- a/validmind/tests/ongoing_monitoring/PredictionProbabilitiesDrift.py +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py b/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py index e4c9b4ad5..8599c6894 100644 --- a/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +++ b/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py @@ -4,7 +4,7 @@ import pandas as pd import numpy as np -from typing import List, Dict, Tuple +from typing import List from validmind import tags, tasks from validmind.vm_models import VMDataset, VMModel diff --git a/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py b/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py index f99bf97e2..c6cce82c7 100644 --- a/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +++ b/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py @@ -5,7 +5,6 @@ import plotly.graph_objects as go import plotly.figure_factory as ff import pandas as pd -import numpy as np from scipy.stats import skew, kurtosis from validmind import tags, tasks From d9d9b441b700eda388019fd5d4c852675546b566 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 15:15:14 +0100 Subject: [PATCH 05/14] Add scorecard executive notebook --- .../application_scorecard_executive.ipynb | 264 ++++++++++++ .../application_scorecard_full_suite.ipynb | 2 +- .../datasets/credit_risk/lending_club.py | 397 ++++++++++++++---- validmind/vm_models/dataset/dataset.py | 4 - 4 files changed, 574 insertions(+), 93 deletions(-) create mode 100644 notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb diff --git a/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb b/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb new file mode 100644 index 000000000..3abde0eb3 --- /dev/null +++ b/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Document an application scorecard model\n", + "\n", + "Build and document an *application scorecard model* with the ValidMind Library by using Kaggle's [Lending Club](https://www.kaggle.com/datasets/devanshi23/loan-data-2007-2014/data) sample dataset to build a simple application scorecard.\n", + "\n", + "An application scorecard model is a type of statistical model used in credit scoring to evaluate the creditworthiness of potential borrowers by generating a score based on various characteristics of an applicant — such as credit history, income, employment status, and other relevant financial data. 
\n", + "\n", + "- This score helps lenders make decisions about whether to approve or reject loan applications, as well as determine the terms of the loan, including interest rates and credit limits. \n", + "- Application scorecard models enable lenders to manage risk efficiently while making the loan application process faster and more transparent for applicants.\n", + "\n", + "This interactive notebook provides a step-by-step guide for loading a demo dataset, preprocessing the raw data, training a model for testing, setting up test inputs, initializing the required ValidMind objects, running the test, and then logging the results to ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language.\n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "If you haven't already seen our [Get started with the ValidMind Library](https://docs.validmind.ai/developer/get-started-validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models, find code samples, or read our developer reference.\n", + "\n", + "
For access to all features available in this notebook, create a free ValidMind account.\n", + "

\n", + "Signing up is FREE — Register with ValidMind
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + "- **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + "- **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + "- **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + "- **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: The [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Initialize the ValidMind Library\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "\n", + "\n", + "### Get your code snippet\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Model Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Continue**. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + " For example, to register a model for use with this notebook, select:\n", + "\n", + " - Documentation template: `Credit Risk Scorecard`\n", + " - Use case: `Credit Risk - CECL`\n", + "\n", + " You can fill in other options according to your preference.\n", + "\n", + "4. Go to **Getting Started** and click **Copy snippet to clipboard**.\n", + "\n", + "Next, replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Document the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.datasets.credit_risk import lending_club\n", + "\n", + "lending_club.document_model()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", + "\n", + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. In the ValidMind Platform, go to the **Documentation** page for the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html))\n", + "\n", + "2. Expand the following sections and take a look around:\n", + "\n", + " - **2. Data Preparation**\n", + " - **3. Model Development**\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation (hint: some of the tests in **2.3. 
Feature Selection and Engineering** look like they need some attention), view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready.\n", + "\n", + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you document models:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/developer/model-testing/testing-overview.html)\n", + "- [Code samples](https://docs.validmind.ai/developer/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-eEL8LtKG-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb b/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb index 1e956cc1a..750ebc967 100644 --- a/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb +++ b/notebooks/code_samples/credit_risk/application_scorecard_full_suite.ipynb @@ -605,7 +605,7 @@ "\n", " For each metric in the test results, include in the test overview:\n", " - The metric's purpose and what it measures\n", - " - Its mathematical formula in LaTeX notation\n", + " - Its mathematical formula\n", " - The range of possible values\n", " - What constitutes good/bad performance\n", " - How to interpret different values\n", diff --git a/validmind/datasets/credit_risk/lending_club.py b/validmind/datasets/credit_risk/lending_club.py index 133ea0702..72adc3d89 100644 --- a/validmind/datasets/credit_risk/lending_club.py +++ b/validmind/datasets/credit_risk/lending_club.py @@ -3,13 +3,20 @@ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial import os - +import warnings +import logging import numpy as np import pandas as pd import scorecardpy as sc import statsmodels.api as sm + +import xgboost as xgb +import validmind as vm + +from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split + current_path = os.path.dirname(os.path.abspath(__file__)) dataset_path = os.path.join(current_path, "datasets") @@ -95,7 +102,7 @@ } -def load_data(source="online"): +def load_data(source="online", verbose=True): """ Load data from either an online source or offline files, automatically dropping specified columns for offline data. 
@@ -104,28 +111,33 @@ def load_data(source="online"): """ if source == "online": - print(f"Loading data from an online source: {online_data_file}") + if verbose: + print(f"Loading data from an online source: {online_data_file}") df = pd.read_csv(online_data_file) - df = _clean_data(df) + df = _clean_data(df, verbose=verbose) elif source == "offline": - print(f"Loading data from an offline .gz file: {offline_data_file}") + if verbose: + print(f"Loading data from an offline .gz file: {offline_data_file}") # Since we know the offline_data_file path ends with '.zip', we replace it with '.csv.gz' gzip_file_path = offline_data_file.replace(".zip", ".csv.gz") - print(f"Attempting to read from .gz file: {gzip_file_path}") + if verbose: + print(f"Attempting to read from .gz file: {gzip_file_path}") # Read the CSV file directly from the .gz archive df = pd.read_csv(gzip_file_path, compression="gzip") - print("Data loaded successfully.") + if verbose: + print("Data loaded successfully.") else: raise ValueError("Invalid source specified. Choose 'online' or 'offline'.") - print( - f"Rows: {df.shape[0]}, Columns: {df.shape[1]}, Missing values: {df.isnull().sum().sum()}" - ) + if verbose: + print( + f"Rows: {df.shape[0]}, Columns: {df.shape[1]}, Missing values: {df.isnull().sum().sum()}" + ) return df -def _clean_data(df): +def _clean_data(df, verbose=True): df = df.copy() # Drop columns not relevant for application scorecards @@ -133,41 +145,45 @@ def _clean_data(df): # Drop rows with missing target values df.dropna(subset=[target_column], inplace=True) - print("Dropping rows with missing target values:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Dropping rows with missing target values:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Drop columns with more than N percent missing values missing_values = df.isnull().mean() df = df.loc[:, missing_values < 0.7] - print("Dropping columns with more than 70% missing values:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Dropping columns with more than 70% missing values:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Drop columns with only one unique value unique_values = df.nunique() df = df.loc[:, unique_values > 1] - print("Dropping columns with only one unique value:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Dropping columns with only one unique value:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Define the target variable for the model, representing loan default status. 
df[target_column] = df[target_column].map({"Fully Paid": 0, "Charged Off": 1}) # Drop rows with NaN in target_column after mapping df.dropna(subset=[target_column], inplace=True) - print("Dropping rows with missing target values:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Dropping rows with missing target values:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) return df -def preprocess(df): +def preprocess(df, verbose=True): df = df.copy() # Convert the target variable to integer type for modeling. @@ -175,45 +191,51 @@ def preprocess(df): # Keep rows where purpose is 'debt_consolidation' or 'credit_card' df = df[df["purpose"].isin(["debt_consolidation", "credit_card"])] - print("Filtering 'purpose' to 'debt_consolidation' and 'credit_card':") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Filtering 'purpose' to 'debt_consolidation' and 'credit_card':") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Remove rows where grade is 'F' or 'G' df = df[~df["grade"].isin(["F", "G"])] - print("Filtering out 'grade' F and G:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Filtering out 'grade' F and G:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Remove rows where sub_grade starts with 'F' or 'G' df = df[~df["sub_grade"].str.startswith(("F", "G"))] - print("Filtering out 'sub_grade' F and G:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Filtering out 'sub_grade' F and G:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Remove rows where home_ownership is 'OTHER', 'NONE', or 'ANY' df = df[~df["home_ownership"].isin(["OTHER", "NONE", "ANY"])] - print("Filtering out 'home_ownership' OTHER, NONE, ANY:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Filtering out 'home_ownership' OTHER, NONE, ANY:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Drop features that are not useful for modeling df.drop(drop_features, axis=1, inplace=True) - print("Dropping specified features:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Dropping specified features:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Drop rows with missing values df.dropna(inplace=True) - print("Dropping rows with any missing values:") - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print("Dropping rows with any missing values:") + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) # Preprocess emp_length column df = _preprocess_emp_length(df) @@ -260,34 +282,37 @@ def _preprocess_emp_length(df): return df -def feature_engineering(df): +def feature_engineering(df, verbose=True): df = df.copy() # WoE encoding of numerical and categorical features - 
df = woe_encoding(df) + df = woe_encoding(df, verbose=verbose) - print( - f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" - ) + if verbose: + print( + f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n" + ) return df -def woe_encoding(df): +def woe_encoding(df, verbose=True): df = df.copy() - woe = _woebin(df) + woe = _woebin(df, verbose=verbose) bins = _woe_to_bins(woe) # Make sure we don't transform the target column if target_column in bins: del bins[target_column] - print(f"Excluded {target_column} from WoE transformation.") + if verbose: + print(f"Excluded {target_column} from WoE transformation.") # Apply the WoE transformation df = sc.woebin_ply(df, bins=bins) - print("Successfully converted features to WoE values.") + if verbose: + print("Successfully converted features to WoE values.") return df @@ -326,7 +351,7 @@ def _woe_to_bins(woe): return bins -def _woebin(df): +def _woebin(df, verbose=True): """ This function performs automatic binning using WoE. df: A pandas dataframe @@ -337,9 +362,10 @@ def _woebin(df): df[non_numeric_cols] = df[non_numeric_cols].astype(str) try: - print( - f"Performing binning with breaks_adj: {breaks_adj}" - ) # print the breaks_adj being used + if verbose: + print( + f"Performing binning with breaks_adj: {breaks_adj}" + ) # print the breaks_adj being used bins = sc.woebin(df, target_column, breaks_list=breaks_adj) except Exception as e: print("Error during binning: ") @@ -355,7 +381,7 @@ def _woebin(df): return bins_df -def split(df, validation_size=None, test_size=0.2, add_constant=False): +def split(df, validation_size=None, test_size=0.2, add_constant=False, verbose=True): """ Split dataset into train, validation (optional), and test sets. 
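
# A brief sketch of the two split modes documented above, assuming fe_df is the
# feature-engineered frame returned by lending_club.feature_engineering; the 0.2
# sizes and verbose=False are illustrative assumptions only.
from validmind.datasets.credit_risk import lending_club

train_df, test_df = lending_club.split(fe_df, test_size=0.2, verbose=False)                       # two-way split
train_df, validation_df, test_df = lending_club.split(fe_df, validation_size=0.2, test_size=0.2)  # three-way split
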
@@ -384,15 +410,16 @@ def split(df, validation_size=None, test_size=0.2, add_constant=False): train_val_df = sm.add_constant(train_val_df) # Print details for two-way split - print("After splitting the dataset into training and test sets:") - print( - f"Training Dataset:\nRows: {train_val_df.shape[0]}\nColumns: {train_val_df.shape[1]}\n" - f"Missing values: {train_val_df.isnull().sum().sum()}\n" - ) - print( - f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\n" - f"Missing values: {test_df.isnull().sum().sum()}\n" - ) + if verbose: + print("After splitting the dataset into training and test sets:") + print( + f"Training Dataset:\nRows: {train_val_df.shape[0]}\nColumns: {train_val_df.shape[1]}\n" + f"Missing values: {train_val_df.isnull().sum().sum()}\n" + ) + print( + f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\n" + f"Missing values: {test_df.isnull().sum().sum()}\n" + ) return train_val_df, test_df @@ -407,19 +434,20 @@ def split(df, validation_size=None, test_size=0.2, add_constant=False): validation_df = sm.add_constant(validation_df) # Print details for three-way split - print("After splitting the dataset into training, validation, and test sets:") - print( - f"Training Dataset:\nRows: {train_df.shape[0]}\nColumns: {train_df.shape[1]}\n" - f"Missing values: {train_df.isnull().sum().sum()}\n" - ) - print( - f"Validation Dataset:\nRows: {validation_df.shape[0]}\nColumns: {validation_df.shape[1]}\n" - f"Missing values: {validation_df.isnull().sum().sum()}\n" - ) - print( - f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\n" - f"Missing values: {test_df.isnull().sum().sum()}\n" - ) + if verbose: + print("After splitting the dataset into training, validation, and test sets:") + print( + f"Training Dataset:\nRows: {train_df.shape[0]}\nColumns: {train_df.shape[1]}\n" + f"Missing values: {train_df.isnull().sum().sum()}\n" + ) + print( + f"Validation Dataset:\nRows: {validation_df.shape[0]}\nColumns: {validation_df.shape[1]}\n" + f"Missing values: {validation_df.isnull().sum().sum()}\n" + ) + print( + f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\n" + f"Missing values: {test_df.isnull().sum().sum()}\n" + ) return train_df, validation_df, test_df @@ -822,3 +850,196 @@ def get_demo_test_config(x_test=None, y_test=None): } return default_config + + +def document_model(): + + warnings.filterwarnings("ignore") + logging.getLogger("scorecardpy").setLevel(logging.ERROR) + + os.environ["VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED"] = "1" + + context = """ + FORMAT FOR THE LLM DESCRIPTIONS: + **** is designed to . + + The test operates by + + The primary advantages of this test include + + Users should be aware that + + **Key Insights:** + + The test results reveal: + + - ****: + - ****: + ... + + Based on these results, + + ADDITIONAL INSTRUCTIONS: + Present insights in order from general to specific, with each insight as a single bullet point with bold title. + + For each metric in the test results, include in the test overview: + - The metric's purpose and what it measures + - Its mathematical formula + - The range of possible values + - What constitutes good/bad performance + - How to interpret different values + + Each insight should progressively cover: + 1. Overall scope and distribution + 2. Complete breakdown of all elements with specific values + 3. Natural groupings and patterns + 4. Comparative analysis between datasets/categories + 5. Stability and variations + 6. 
Notable relationships or dependencies + + Remember: + - Keep all insights at the same level (no sub-bullets or nested structures) + - Make each insight complete and self-contained + - Include specific numerical values and ranges + - Cover all elements in the results comprehensively + - Maintain clear, concise language + - Use only "- **Title**: Description" format for insights + - Progress naturally from general to specific observations + + """.strip() + + os.environ["VALIDMIND_LLM_DESCRIPTIONS_CONTEXT"] = context + + # Load the data + df = load_data(source="offline", verbose=False) + preprocess_df = preprocess(df, verbose=False) + fe_df = feature_engineering(preprocess_df, verbose=False) + + # Split the data + train_df, test_df = split(fe_df, test_size=0.2, verbose=False) + + x_train = train_df.drop(target_column, axis=1) + y_train = train_df[target_column] + + x_test = test_df.drop(target_column, axis=1) + y_test = test_df[target_column] + + # Define the XGBoost model + xgb_model = xgb.XGBClassifier( + n_estimators=50, random_state=42, early_stopping_rounds=10 + ) + xgb_model.set_params( + eval_metric=["error", "logloss", "auc"], + ) + + # Fit the model + xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False) + + # Define the Random Forest model + rf_model = RandomForestClassifier( + n_estimators=50, + random_state=42, + ) + + # Fit the model + rf_model.fit(x_train, y_train) + + # Compute the probabilities + train_xgb_prob = xgb_model.predict_proba(x_train)[:, 1] + test_xgb_prob = xgb_model.predict_proba(x_test)[:, 1] + + train_rf_prob = rf_model.predict_proba(x_train)[:, 1] + test_rf_prob = rf_model.predict_proba(x_test)[:, 1] + + # Compute binary predictions + cut_off_threshold = 0.3 + + train_xgb_binary_predictions = (train_xgb_prob > cut_off_threshold).astype(int) + test_xgb_binary_predictions = (test_xgb_prob > cut_off_threshold).astype(int) + + train_rf_binary_predictions = (train_rf_prob > cut_off_threshold).astype(int) + test_rf_binary_predictions = (test_rf_prob > cut_off_threshold).astype(int) + + vm_raw_dataset = vm.init_dataset( + dataset=df, + input_id="raw_dataset", + target_column=target_column, + ) + + vm_preprocess_dataset = vm.init_dataset( + dataset=preprocess_df, + input_id="preprocess_dataset", + target_column=target_column, + ) + + vm_fe_dataset = vm.init_dataset( + dataset=fe_df, + input_id="fe_dataset", + target_column=target_column, + ) + + vm_train_ds = vm.init_dataset( + dataset=train_df, + input_id="train_dataset", + target_column=target_column, + ) + + vm_test_ds = vm.init_dataset( + dataset=test_df, + input_id="test_dataset", + target_column=target_column, + ) + + vm_xgb_model = vm.init_model( + xgb_model, + input_id="xgb_model", + ) + + vm_rf_model = vm.init_model( + rf_model, + input_id="rf_model", + ) + + # Assign predictions + vm_train_ds.assign_predictions( + model=vm_xgb_model, + prediction_values=train_xgb_binary_predictions, + prediction_probabilities=train_xgb_prob, + ) + + vm_test_ds.assign_predictions( + model=vm_xgb_model, + prediction_values=test_xgb_binary_predictions, + prediction_probabilities=test_xgb_prob, + ) + + vm_train_ds.assign_predictions( + model=vm_rf_model, + prediction_values=train_rf_binary_predictions, + prediction_probabilities=train_rf_prob, + ) + + vm_test_ds.assign_predictions( + model=vm_rf_model, + prediction_values=test_rf_binary_predictions, + prediction_probabilities=test_rf_prob, + ) + + # Compute credit risk scores + train_xgb_scores = compute_scores(train_xgb_prob) + test_xgb_scores = 
compute_scores(test_xgb_prob) + + # Assign scores to the datasets + vm_train_ds.add_extra_column("xgb_scores", train_xgb_scores) + vm_test_ds.add_extra_column("xgb_scores", test_xgb_scores) + + # Get the test config + test_config = get_demo_test_config(x_test, y_test) + + vm.run_documentation_tests(config=test_config) diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py index f8b82a351..25b65f70d 100644 --- a/validmind/vm_models/dataset/dataset.py +++ b/validmind/vm_models/dataset/dataset.py @@ -369,10 +369,6 @@ def add_extra_column(self, column_name, column_values=None): # reset feature columns to exclude the new extra column self._set_feature_columns() - logger.info( - f"Extra column {column_name} with {len(column_values)} values added to the dataset" - ) - @property def df(self) -> pd.DataFrame: """ From 4ff4ff11a74c8340e2b3fe5ff82efaaa1276f793 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 15:22:20 +0100 Subject: [PATCH 06/14] Remove credentials --- .../credit_risk/application_scorecard_with_ml.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb b/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb index e94859df7..05fb6cdb4 100644 --- a/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb +++ b/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb @@ -145,9 +145,9 @@ "\n", "vm.init(\n", " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", - " api_key = \"f3e49f241081145facbbf59e93bcd8a9\",\n", - " api_secret = \"c8dae73c5cc063cd070fa19508e625f60fe6dd18dddf96afed0d932ded91f530\",\n", - " model = \"cm5fdpjre0lhq29iapn0uuwiw\"\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\"\n", ")" ] }, From e56b904f3a05e376dd11d4ba7c86698ddc55742e Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 15:38:16 +0100 Subject: [PATCH 07/14] Fix lint --- .../datasets/credit_risk/lending_club.py | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/validmind/datasets/credit_risk/lending_club.py b/validmind/datasets/credit_risk/lending_club.py index 72adc3d89..4a733635a 100644 --- a/validmind/datasets/credit_risk/lending_club.py +++ b/validmind/datasets/credit_risk/lending_club.py @@ -860,30 +860,24 @@ def document_model(): os.environ["VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED"] = "1" context = """ - FORMAT FOR THE LLM DESCRIPTIONS: - **** is designed to . + FORMAT FOR THE LLM DESCRIPTIONS: + **** is designed to . - The test operates by + The test operates by - The primary advantages of this test include + The primary advantages of this test include - Users should be aware that + Users should be aware that - **Key Insights:** + **Key Insights:** - The test results reveal: + The test results reveal: - - ****: - - ****: - ... + - ****: + - ****: + ... - Based on these results, + Based on these results, ADDITIONAL INSTRUCTIONS: Present insights in order from general to specific, with each insight as a single bullet point with bold title. 
@@ -966,19 +960,19 @@ def document_model(): train_rf_binary_predictions = (train_rf_prob > cut_off_threshold).astype(int) test_rf_binary_predictions = (test_rf_prob > cut_off_threshold).astype(int) - vm_raw_dataset = vm.init_dataset( + vm.init_dataset( dataset=df, input_id="raw_dataset", target_column=target_column, ) - vm_preprocess_dataset = vm.init_dataset( + vm.init_dataset( dataset=preprocess_df, input_id="preprocess_dataset", target_column=target_column, ) - vm_fe_dataset = vm.init_dataset( + vm.init_dataset( dataset=fe_df, input_id="fe_dataset", target_column=target_column, From 8dfbbc378f59ec45baaad4f62a9935c83de2a36b Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 15:47:50 +0100 Subject: [PATCH 08/14] Skip ScoreBandsDrift and ScorecardHistogramDrift for integration testing --- tests/test_integration_tests.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_integration_tests.py b/tests/test_integration_tests.py index 6da9996c3..f6e5b947d 100644 --- a/tests/test_integration_tests.py +++ b/tests/test_integration_tests.py @@ -49,6 +49,8 @@ # The required column 'score' is not present in the dataset with input_id test_dataset "validmind.data_validation.ScoreBandDefaultRates", "validmind.model_validation.sklearn.ScoreProbabilityAlignment", + "validmind.ongoing_monitoring.ScoreBandsDrift", + "validmind.ongoing_monitoring.ScorecardHistogramDrift", ] SKIPPED_TESTS = [] SUCCESSFUL_TESTS = [] From c96764edbf7cabb8ec9125aa5be17b5a36bb3107 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 17:32:58 +0100 Subject: [PATCH 09/14] Update notebooks --- .../application_scorecard_executive.ipynb | 31 +- .../application_scorecard_with_ml.ipynb | 1073 +++++++---------- ...ication_scorecard_ongoing_monitoring.ipynb | 486 ++++---- .../datasets/credit_risk/lending_club.py | 63 +- 4 files changed, 754 insertions(+), 899 deletions(-) diff --git a/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb b/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb index 3abde0eb3..3ee2b1e6b 100644 --- a/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb +++ b/notebooks/code_samples/credit_risk/application_scorecard_executive.ipynb @@ -160,8 +160,37 @@ "outputs": [], "source": [ "from validmind.datasets.credit_risk import lending_club\n", + "from validmind.utils import preview_test_config\n", "\n", - "lending_club.document_model()" + "scorecard = lending_club.load_scorecard()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "lending_club.init_vm_objects(scorecard)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_config = lending_club.load_test_config(scorecard)\n", + "preview_test_config(test_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.run_documentation_tests(config=test_config)" ] }, { diff --git a/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb b/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb index 05fb6cdb4..9f2f5b29f 100644 --- a/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb +++ b/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb @@ -545,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -648,15 +648,12 @@ "metadata": {}, "outputs": 
[], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DatasetDescription:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DatasetDescription:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + ").log()" ] }, { @@ -665,15 +662,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " }\n", + ").log()" ] }, { @@ -682,18 +676,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MissingValues:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MissingValues:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -702,18 +693,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.ClassImbalance:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.ClassImbalance:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 10\n", + " }\n", + ").log()" ] }, { @@ -722,18 +710,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.Duplicates:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.Duplicates:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -742,20 +727,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.HighCardinality:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"num_threshold\": 100,\n", - " \"percent_threshold\": 0.1,\n", - " \"threshold_type\": \"percent\"\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.HighCardinality:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"num_threshold\": 100,\n", + " \"percent_threshold\": 0.1,\n", + " \"threshold_type\": \"percent\"\n", + " }\n", + ").log()" ] }, { @@ -764,18 +746,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.Skewness:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"max_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.Skewness:raw_data\",\n", + " inputs={\n", + 
" \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"max_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -784,18 +763,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.UniqueRows:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.UniqueRows:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -804,18 +780,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TooManyZeroValues:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"max_percent_threshold\": 0.03\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TooManyZeroValues:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"max_percent_threshold\": 0.03\n", + " }\n", + ").log()" ] }, { @@ -824,18 +797,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.IQROutliersTable:raw_data\",\n", - " inputs={\n", - " \"dataset\": vm_raw_dataset,\n", - " },\n", - " params={\n", - " \"threshold\": 5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.IQROutliersTable:raw_data\",\n", + " inputs={\n", + " \"dataset\": vm_raw_dataset,\n", + " },\n", + " params={\n", + " \"threshold\": 5\n", + " }\n", + ").log()" ] }, { @@ -853,15 +823,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset,\n", + " }\n", + ").log()" ] }, { @@ -870,15 +837,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularDescriptionTables:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularDescriptionTables:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + ").log()" ] }, { @@ -887,18 +851,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MissingValues:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MissingValues:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -907,15 +868,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularNumericalHistograms:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - 
" },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + ").log()" ] }, { @@ -924,15 +882,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularCategoricalBarPlots:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularCategoricalBarPlots:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + ").log()" ] }, { @@ -941,18 +896,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TargetRateBarPlots:preprocessed_data\",\n", - " inputs={\n", - " \"dataset\": vm_preprocess_dataset\n", - " },\n", - " params={\n", - " \"default_column\": lending_club.target_column,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TargetRateBarPlots:preprocessed_data\",\n", + " inputs={\n", + " \"dataset\": vm_preprocess_dataset\n", + " },\n", + " params={\n", + " \"default_column\": lending_club.target_column,\n", + " },\n", + ").log()" ] }, { @@ -968,15 +920,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " }\n", + ").log()" ] }, { @@ -985,15 +934,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularDescriptionTables:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularDescriptionTables:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1002,18 +948,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.ClassImbalance:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.ClassImbalance:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 10\n", + " }\n", + ").log()" ] }, { @@ -1022,18 +965,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.UniqueRows:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.UniqueRows:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" ] }, { @@ -1042,15 +982,12 @@ "metadata": {}, "outputs": 
[], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularNumericalHistograms:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1066,18 +1003,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MutualInformation:development_data\",\n", - " input_grid ={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.01,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MutualInformation:development_data\",\n", + " input_grid ={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.01,\n", + " },\n", + ").log()" ] }, { @@ -1086,15 +1020,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.PearsonCorrelationMatrix:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.PearsonCorrelationMatrix:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " }\n", + ").log()" ] }, { @@ -1103,19 +1034,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.HighPearsonCorrelation:development_data\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.3,\n", - " \"top_n_correlations\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.HighPearsonCorrelation:development_data\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.3,\n", + " \"top_n_correlations\": 10\n", + " }\n", + ").log()" ] }, { @@ -1124,18 +1052,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.WOEBinTable\",\n", - " input_grid={\n", - " \"dataset\": [vm_preprocess_dataset]\n", - " },\n", - " params={\n", - " \"breaks_adj\": lending_club.breaks_adj,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.WOEBinTable\",\n", + " input_grid={\n", + " \"dataset\": [vm_preprocess_dataset]\n", + " },\n", + " params={\n", + " \"breaks_adj\": lending_club.breaks_adj,\n", + " },\n", + ").log()" ] }, { @@ -1144,18 +1069,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.WOEBinPlots\",\n", - " input_grid={\n", - " \"dataset\": [vm_preprocess_dataset]\n", - " },\n", - " params={\n", - " \"breaks_adj\": lending_club.breaks_adj,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.WOEBinPlots\",\n", + " input_grid={\n", + " \"dataset\": [vm_preprocess_dataset]\n", + " },\n", + " params={\n", + " \"breaks_adj\": lending_club.breaks_adj,\n", + " },\n", + ").log()" ] }, { @@ -1173,15 +1095,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " 
\"validmind.data_validation.DatasetSplit\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DatasetSplit\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1190,15 +1109,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.ModelMetadata\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.ModelMetadata\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1207,15 +1123,12 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ModelParameters\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ModelParameters\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1231,16 +1144,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.GINITable\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.GINITable\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1249,16 +1159,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ClassifierPerformance\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model, vm_rf_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ClassifierPerformance\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model, vm_rf_model],\n", + " },\n", + ").log()" ] }, { @@ -1267,19 +1174,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.TrainingTestDegradation:XGBoost\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.TrainingTestDegradation:XGBoost\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.1\n", + " }\n", + ").log()" ] }, { @@ -1288,19 +1192,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.TrainingTestDegradation:RandomForest\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_rf_model,\n", - " },\n", - " params={\n", - " \"max_threshold\": 0.1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.TrainingTestDegradation:RandomForest\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + 
" \"model\": vm_rf_model,\n", + " },\n", + " params={\n", + " \"max_threshold\": 0.1\n", + " }\n", + ").log()" ] }, { @@ -1309,23 +1210,19 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " # Run the test\n", - " result = run_test(\n", - " \"validmind.model_validation.sklearn.HyperParametersTuning\",\n", - " inputs={\n", - " \"model\": vm_xgb_model,\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - " params={\n", - " \"param_grid\": {'n_estimators': [50, 100]},\n", - " \"scoring\": ['roc_auc', 'recall'],\n", - " \"fit_params\": {'eval_set': [(x_test, y_test)], 'verbose': False},\n", - " \"thresholds\": [0.3, 0.5],\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.HyperParametersTuning\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + " params={\n", + " \"param_grid\": {'n_estimators': [50, 100]},\n", + " \"scoring\": ['roc_auc', 'recall'],\n", + " \"fit_params\": {'eval_set': [(x_test, y_test)], 'verbose': False},\n", + " \"thresholds\": [0.3, 0.5],\n", + " }\n", + ").log()" ] }, { @@ -1343,16 +1240,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ROCCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ROCCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1361,19 +1255,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumROCAUCScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumROCAUCScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.5\n", + " }\n", + ").log()" ] }, { @@ -1382,16 +1273,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1400,16 +1288,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.CumulativePredictionProbabilities\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.CumulativePredictionProbabilities\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1418,20 +1303,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " 
run_test(\n", - " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"num_bins\": 10,\n", - " \"mode\": \"fixed\"\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"num_bins\": 10,\n", + " \"mode\": \"fixed\"\n", + " }\n", + ").log()" ] }, { @@ -1447,19 +1329,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ClassifierThresholdOptimization\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " \"model\": vm_xgb_model\n", - " },\n", - " params={\n", - " \"target_recall\": 0.8 # Find a threshold that achieves a recall of 80%\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ClassifierThresholdOptimization\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " \"model\": vm_xgb_model\n", + " },\n", + " params={\n", + " \"target_recall\": 0.8 # Find a threshold that achieves a recall of 80%\n", + " }\n", + ").log()" ] }, { @@ -1468,16 +1347,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.CalibrationCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.CalibrationCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1486,16 +1362,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ConfusionMatrix\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ConfusionMatrix\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + ").log()" ] }, { @@ -1504,19 +1377,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumAccuracy\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.7\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumAccuracy\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.7\n", + " }\n", + ").log()" ] }, { @@ -1525,19 +1395,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.MinimumF1Score\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"min_threshold\": 0.5\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.MinimumF1Score\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, 
vm_test_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"min_threshold\": 0.5\n", + " }\n", + ").log()" ] }, { @@ -1546,16 +1413,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PrecisionRecallCurve\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model]\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PrecisionRecallCurve\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model]\n", + " },\n", + ").log()" ] }, { @@ -1571,16 +1435,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" ] }, { @@ -1589,19 +1450,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", - " inputs={\n", - " \"model\": vm_xgb_model,\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"cut_off_threshold\": 0.04\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"cut_off_threshold\": 0.04\n", + " }\n", + ").log()" ] }, { @@ -1610,26 +1468,23 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", - " inputs={\n", - " \"datasets\": [vm_train_ds, vm_test_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"scaling_factor_std_dev_list\": [\n", - " 0.1,\n", - " 0.2,\n", - " 0.3,\n", - " 0.4,\n", - " 0.5\n", - " ],\n", - " \"performance_decay_threshold\": 0.05\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_train_ds, vm_test_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"scaling_factor_std_dev_list\": [\n", + " 0.1,\n", + " 0.2,\n", + " 0.3,\n", + " 0.4,\n", + " 0.5\n", + " ],\n", + " \"performance_decay_threshold\": 0.05\n", + " }\n", + ").log()" ] }, { @@ -1647,16 +1502,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " \"model\": [vm_xgb_model]\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " \"model\": [vm_xgb_model]\n", + " }\n", + ").log()" ] }, { @@ -1665,16 +1517,13 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.FeaturesAUC\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " 
\"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.FeaturesAUC\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + ").log()" ] }, { @@ -1683,20 +1532,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", - " input_grid={\n", - " \"model\": [vm_xgb_model],\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"kernel_explainer_samples\": 10,\n", - " \"tree_or_linear_explainer_samples\": 200,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"kernel_explainer_samples\": 10,\n", + " \"tree_or_linear_explainer_samples\": 200,\n", + " }\n", + ").log()" ] }, { @@ -1712,18 +1558,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.statsmodels.ScorecardHistogram\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds, vm_test_ds],\n", - " },\n", - " params={\n", - " \"score_column\": \"xgb_scores\",\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.statsmodels.ScorecardHistogram\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds, vm_test_ds],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " },\n", + ").log()" ] }, { @@ -1732,20 +1575,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.ScoreBandDefaultRates\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params = {\n", - " \"score_column\": \"xgb_scores\",\n", - " \"score_bands\": [500, 540, 570]\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.ScoreBandDefaultRates\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params = {\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570]\n", + " }\n", + ").log()" ] }, { @@ -1754,19 +1594,16 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.ScoreProbabilityAlignment\",\n", - " input_grid={\n", - " \"dataset\": [vm_train_ds],\n", - " \"model\": [vm_xgb_model],\n", - " },\n", - " params={\n", - " \"score_column\": \"xgb_scores\",\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.ScoreProbabilityAlignment\",\n", + " input_grid={\n", + " \"dataset\": [vm_train_ds],\n", + " \"model\": [vm_xgb_model],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " },\n", + ").log()" ] }, { diff --git a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index 4940ee53c..dbf83f620 100644 --- a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -135,7 +135,7 @@ " api_key = \"...\",\n", " api_secret = \"...\",\n", " 
model = \"...\",\n", - " monitoring = True\n", + " monitoring=True\n", ")" ] }, @@ -155,6 +155,7 @@ "outputs": [], "source": [ "import xgboost as xgb\n", + "import numpy as np\n", "\n", "from datetime import datetime, timedelta\n", "\n", @@ -386,7 +387,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -487,93 +488,78 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.DescriptiveStatistics:monitoring_data\",\n", - " inputs={\n", - " \"dataset\": vm_monitoring_ds,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.DescriptiveStatistics:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " }\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.MissingValues:monitoring_data\",\n", - " inputs={\n", - " \"dataset\": vm_monitoring_ds,\n", - " },\n", - " params={\n", - " \"min_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.MissingValues:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"min_threshold\": 1\n", + " }\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.UniqueRows:monitoring_data\",\n", - " inputs={\n", - " \"dataset\": vm_monitoring_ds,\n", - " },\n", - " params={\n", - " \"min_percent_threshold\": 1\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.UniqueRows:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"min_percent_threshold\": 1\n", + " }\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.TabularNumericalHistograms:monitoring_data\",\n", - " inputs={\n", - " \"dataset\": vm_monitoring_ds,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.TabularNumericalHistograms:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.PearsonCorrelationMatrix:monitoring_data\",\n", - " inputs={\n", - " \"dataset\": vm_monitoring_ds,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.PearsonCorrelationMatrix:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " }\n", + ").log()" ] }, { @@ -582,20 +568,17 @@ "metadata": {}, "outputs": [], "source": [ - "run=True\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.data_validation.HighPearsonCorrelation:monitoring_data\",\n", - " inputs={\n", - " \"dataset\": vm_monitoring_ds,\n", - " },\n", - " params={\n", - " \"feature_columns\": vm_monitoring_ds.feature_columns,\n", - " \"max_threshold\": 0.5,\n", 
- " \"top_n_correlations\": 10\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.data_validation.HighPearsonCorrelation:monitoring_data\",\n", + " inputs={\n", + " \"dataset\": vm_monitoring_ds,\n", + " },\n", + " params={\n", + " \"feature_columns\": vm_monitoring_ds.feature_columns,\n", + " \"max_threshold\": 0.5,\n", + " \"top_n_correlations\": 10\n", + " }\n", + ").log()" ] }, { @@ -604,18 +587,15 @@ "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.ClassImbalanceDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " },\n", - " params={\n", - " \"drift_pct_threshold\": 1\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassImbalanceDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 1\n", + " },\n", + ").log()" ] }, { @@ -639,41 +619,35 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.model_validation.sklearn.PopulationStabilityIndex\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.TargetPredictionDistributionPlot\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"drift_pct_threshold\": 5\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.TargetPredictionDistributionPlot\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5\n", + " },\n", + ").log()" ] }, { @@ -685,23 +659,20 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.PredictionCorrelation\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"drift_pct_threshold\": 5\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.PredictionCorrelation\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5\n", + " },\n", + ").log()" ] }, { @@ -713,20 +684,17 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.PredictionQuantilesAcrossFeatures\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " ).log()" + "run_test(\n", + " 
\"validmind.ongoing_monitoring.PredictionQuantilesAcrossFeatures\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" ] }, { @@ -738,23 +706,20 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.FeatureDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"psi_threshold\": 0.2,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.FeatureDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"psi_threshold\": 0.2,\n", + " },\n", + ").log()" ] }, { @@ -770,66 +735,57 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.ClassificationAccuracyDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"drift_pct_threshold\": 5,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassificationAccuracyDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.ConfusionMatrixDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"drift_pct_threshold\": 5,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.ConfusionMatrixDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.CalibrationCurveDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"n_bins\": 10,\n", - " \"drift_pct_threshold\": 10,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.CalibrationCurveDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"n_bins\": 10,\n", + " \"drift_pct_threshold\": 10,\n", + " },\n", + ").log()" ] }, { @@ -845,80 +801,68 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.ClassDiscriminationDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - 
" },\n", - " params={\n", - " \"drift_pct_threshold\": 5,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.ClassDiscriminationDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 5,\n", + " },\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.ROCCurveDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.ROCCurveDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " }\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.PredictionProbabilitiesHistogramDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"drift_pct_threshold\": 10,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.PredictionProbabilitiesHistogramDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"drift_pct_threshold\": 10,\n", + " },\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.CumulativePredictionProbabilitiesDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " }\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.CumulativePredictionProbabilitiesDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " }\n", + ").log()" ] }, { @@ -934,46 +878,40 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.ScorecardHistogramDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " },\n", - " params={\n", - " \"score_column\": \"xgb_scores\",\n", - " \"drift_pct_threshold\": 20,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.ScorecardHistogramDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"drift_pct_threshold\": 20,\n", + " },\n", + ").log()" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "run=False\n", - "if run:\n", - "\n", - " run_test(\n", - " \"validmind.ongoing_monitoring.ScoreBandsDrift\",\n", - " inputs={\n", - " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", - " \"model\": vm_xgb_model,\n", - " },\n", - " params={\n", - " \"score_column\": \"xgb_scores\",\n", - " \"score_bands\": [500, 540, 570],\n", - " 
\"drift_pct_threshold\": 20,\n", - " },\n", - " ).log()" + "run_test(\n", + " \"validmind.ongoing_monitoring.ScoreBandsDrift\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " \"drift_pct_threshold\": 20,\n", + " },\n", + ").log()" ] }, { @@ -1085,10 +1023,10 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "\n", "NUM_DAYS = 10\n", - "base_date = datetime.now() - timedelta(days=NUM_DAYS)\n", + "REFERENCE_DATE = datetime(2024, 1, 1) # Fixed date: January 1st, 2024\n", + "base_date = REFERENCE_DATE - timedelta(days=NUM_DAYS)\n", + "\n", "\n", "# Initial values\n", "performance_metrics = {\n", diff --git a/validmind/datasets/credit_risk/lending_club.py b/validmind/datasets/credit_risk/lending_club.py index 4a733635a..234b66031 100644 --- a/validmind/datasets/credit_risk/lending_club.py +++ b/validmind/datasets/credit_risk/lending_club.py @@ -852,7 +852,7 @@ def get_demo_test_config(x_test=None, y_test=None): return default_config -def document_model(): +def load_scorecard(): warnings.filterwarnings("ignore") logging.getLogger("scorecardpy").setLevel(logging.ERROR) @@ -960,6 +960,55 @@ def document_model(): train_rf_binary_predictions = (train_rf_prob > cut_off_threshold).astype(int) test_rf_binary_predictions = (test_rf_prob > cut_off_threshold).astype(int) + # Compute credit risk scores + train_xgb_scores = compute_scores(train_xgb_prob) + test_xgb_scores = compute_scores(test_xgb_prob) + + scorecard = { + "df": df, + "preprocess_df": preprocess_df, + "fe_df": fe_df, + "train_df": train_df, + "test_df": test_df, + "x_test": x_test, + "y_test": y_test, + "xgb_model": xgb_model, + "rf_model": rf_model, + "train_xgb_binary_predictions": train_xgb_binary_predictions, + "test_xgb_binary_predictions": test_xgb_binary_predictions, + "train_xgb_prob": train_xgb_prob, + "test_xgb_prob": test_xgb_prob, + "train_xgb_scores": train_xgb_scores, + "test_xgb_scores": test_xgb_scores, + "train_rf_binary_predictions": train_rf_binary_predictions, + "test_rf_binary_predictions": test_rf_binary_predictions, + "train_rf_prob": train_rf_prob, + "test_rf_prob": test_rf_prob, + } + + return scorecard + + +def init_vm_objects(scorecard): + + df = scorecard["df"] + preprocess_df = scorecard["preprocess_df"] + fe_df = scorecard["fe_df"] + train_df = scorecard["train_df"] + test_df = scorecard["test_df"] + xgb_model = scorecard["xgb_model"] + rf_model = scorecard["rf_model"] + train_xgb_binary_predictions = scorecard["train_xgb_binary_predictions"] + test_xgb_binary_predictions = scorecard["test_xgb_binary_predictions"] + train_xgb_prob = scorecard["train_xgb_prob"] + test_xgb_prob = scorecard["test_xgb_prob"] + train_rf_binary_predictions = scorecard["train_rf_binary_predictions"] + test_rf_binary_predictions = scorecard["test_rf_binary_predictions"] + train_rf_prob = scorecard["train_rf_prob"] + test_rf_prob = scorecard["test_rf_prob"] + train_xgb_scores = scorecard["train_xgb_scores"] + test_xgb_scores = scorecard["test_xgb_scores"] + vm.init_dataset( dataset=df, input_id="raw_dataset", @@ -1025,15 +1074,17 @@ def document_model(): prediction_probabilities=test_rf_prob, ) - # Compute credit risk scores - train_xgb_scores = compute_scores(train_xgb_prob) - test_xgb_scores = compute_scores(test_xgb_prob) - # Assign scores to the datasets vm_train_ds.add_extra_column("xgb_scores", train_xgb_scores) 
vm_test_ds.add_extra_column("xgb_scores", test_xgb_scores) + +def load_test_config(scorecard): + + x_test = scorecard["x_test"] + y_test = scorecard["y_test"] + # Get the test config test_config = get_demo_test_config(x_test, y_test) - vm.run_documentation_tests(config=test_config) + return test_config From d2ac64eed03148a4af690de8b14a16df647b1d47 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 17:43:08 +0100 Subject: [PATCH 10/14] Added some more testing to ongoing monitoring notebook --- ...ication_scorecard_ongoing_monitoring.ipynb | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index dbf83f620..f65efa1f3 100644 --- a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -914,6 +914,78 @@ ").log()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Diagnostic monitoring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.WeakspotsDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.OverfitDiagnosis\",\n", + " inputs={\n", + " \"model\": vm_xgb_model,\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"cut_off_threshold\": 0.04\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Robustness monitoring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.RobustnessDiagnosis\",\n", + " inputs={\n", + " \"datasets\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"scaling_factor_std_dev_list\": [\n", + " 0.1,\n", + " 0.2,\n", + " 0.3,\n", + " 0.4,\n", + " 0.5\n", + " ],\n", + " \"performance_decay_threshold\": 0.05\n", + " }\n", + ").log()" + ] + }, { "cell_type": "markdown", "metadata": {}, From 9c24146565ecfe2fcf6bbe2c722cfeac9296c36f Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 18:44:56 +0100 Subject: [PATCH 11/14] Added explainability testing to ongoing monitoring --- ...ication_scorecard_ongoing_monitoring.ipynb | 58 ++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index f65efa1f3..ab5d6d4bf 100644 --- a/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/notebooks/code_samples/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -135,7 +135,7 @@ " api_key = \"...\",\n", " api_secret = \"...\",\n", " model = \"...\",\n", - " monitoring=True\n", + " monitoring = True\n", ")" ] }, @@ -914,6 +914,62 @@ ").log()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model 
insights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.PermutationFeatureImportance\",\n", + " input_grid={\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " \"model\": [vm_xgb_model]\n", + " }\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.FeaturesAUC\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.sklearn.SHAPGlobalImportance\",\n", + " input_grid={\n", + " \"model\": [vm_xgb_model],\n", + " \"dataset\": [vm_reference_ds, vm_monitoring_ds],\n", + " },\n", + " params={\n", + " \"kernel_explainer_samples\": 10,\n", + " \"tree_or_linear_explainer_samples\": 200,\n", + " }\n", + ").log()" + ] + }, { "cell_type": "markdown", "metadata": {}, From a2a7ebea6f063bfb72811fbe4b6a318bfa112b0e Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 20:35:50 +0100 Subject: [PATCH 12/14] Add custom tests to notebook --- .../application_scorecard_with_ml.ipynb | 153 ++++++++++++++ .../ScoreBandDiscriminationMetrics.py | 193 ++++++++++++++++++ .../CalibrationCurveDrift.py | 48 ++++- .../ClassDiscriminationDrift.py | 43 +++- .../ongoing_monitoring/ClassImbalanceDrift.py | 49 ++++- .../ClassificationAccuracyDrift.py | 40 +++- .../ConfusionMatrixDrift.py | 42 +++- .../CumulativePredictionProbabilitiesDrift.py | 49 ++++- .../PredictionProbabilitiesHistogramDrift.py | 51 +++-- .../tests/ongoing_monitoring/ROCCurveDrift.py | 42 +++- .../ongoing_monitoring/ScoreBandsDrift.py | 51 +++-- .../ScorecardHistogramDrift.py | 48 ++++- 12 files changed, 722 insertions(+), 87 deletions(-) create mode 100644 notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py diff --git a/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb b/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb index 9f2f5b29f..26a983f10 100644 --- a/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb +++ b/notebooks/code_samples/credit_risk/application_scorecard_with_ml.ipynb @@ -1606,6 +1606,159 @@ ").log()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom tests\n", + "\n", + "Custom tests extend the functionality of ValidMind, allowing you to document any model or use case with added flexibility.\n", + "\n", + "ValidMind provides a comprehensive set of tests out-of-the-box to evaluate and document your models and datasets. We recognize there will be cases where the default tests do not support a model or dataset, or specific documentation is needed. In these cases, you can create and use your own custom code to accomplish what you need. To streamline custom code integration, we support the creation of custom test functions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### In-line custom tests\n", + "\n", + "The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ScoreToOdds\"`. 
The function `score_to_odds_analysis` takes three arguments `dataset`, `score_column`, and `score_bands`. This is a `VMDataset` and the rest are parameters that can be passed in." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ScoreToOdds\")\n", + "def score_to_odds_analysis(dataset, score_column='score', score_bands=[410, 440, 470]):\n", + " \"\"\"\n", + " Analyzes the relationship between score bands and odds (good:bad ratio).\n", + " Good odds = (1 - default_rate) / default_rate\n", + " \n", + " Higher scores should correspond to higher odds of being good.\n", + " \"\"\"\n", + " df = dataset.df\n", + " \n", + " # Create score bands\n", + " df['score_band'] = pd.cut(\n", + " df[score_column],\n", + " bins=[-np.inf] + score_bands + [np.inf],\n", + " labels=[f'<{score_bands[0]}'] + \n", + " [f'{score_bands[i]}-{score_bands[i+1]}' for i in range(len(score_bands)-1)] +\n", + " [f'>{score_bands[-1]}']\n", + " )\n", + " \n", + " # Calculate metrics per band\n", + " results = df.groupby('score_band').agg({\n", + " dataset.target_column: ['mean', 'count']\n", + " })\n", + " \n", + " results.columns = ['Default Rate', 'Total']\n", + " results['Good Count'] = results['Total'] - (results['Default Rate'] * results['Total'])\n", + " results['Bad Count'] = results['Default Rate'] * results['Total']\n", + " results['Odds'] = results['Good Count'] / results['Bad Count']\n", + " \n", + " # Create visualization\n", + " fig = go.Figure()\n", + " \n", + " # Add odds bars\n", + " fig.add_trace(go.Bar(\n", + " name='Odds (Good:Bad)',\n", + " x=results.index,\n", + " y=results['Odds'],\n", + " marker_color='blue'\n", + " ))\n", + " \n", + " fig.update_layout(\n", + " title='Score-to-Odds Analysis',\n", + " yaxis=dict(title='Odds Ratio (Good:Bad)'),\n", + " showlegend=False\n", + " )\n", + " \n", + " return fig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"my_custom_tests.ScoreToOdds\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Local test provider\n", + "\n", + "The ValidMind Library offers the ability to extend the built-in library of tests with custom tests. A test \"Provider\" is a Python class that gets registered with the ValidMind Library and loads tests based on a test ID, for example `my_test_provider.my_test_id`. The built-in suite of tests that ValidMind offers is technically its own test provider. You can use one the built-in test provider offered by ValidMind (`validmind.tests.test_providers.LocalTestProvider`) or you can create your own. More than likely, you'll want to use the `LocalTestProvider` to add a directory of custom tests but there's flexibility to be able to load tests from any source." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import LocalTestProvider\n", + "\n", + "# Define the folder where your tests are located\n", + "tests_folder = \"custom_tests\"\n", + "\n", + "# initialize the test provider with the tests folder we created earlier\n", + "my_test_provider = LocalTestProvider(tests_folder)\n", + "\n", + "vm.tests.register_test_provider(\n", + " namespace=\"my_test_provider\",\n", + " test_provider=my_test_provider,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our test provider set up, we can run any test that's located in our tests folder by using the `run_test()` method. This function is your entry point to running single tests in the ValidMind Library. It takes a test ID and runs the test associated with that ID. For our custom tests, the test ID will be the `namespace` specified when registering the provider, followed by the path to the test file relative to the tests folder. For example, the Confusion Matrix test we created earlier will have the test ID `my_test_provider.ConfusionMatrix`. You could organize the tests in subfolders, say `classification` and `regression`, and the test ID for the Confusion Matrix test would then be `my_test_provider.classification.ConfusionMatrix`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"my_test_provider.ScoreBandDiscriminationMetrics\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_xgb_model,\n", + " },\n", + " params={\n", + " \"score_column\": \"xgb_scores\",\n", + " \"score_bands\": [500, 540, 570],\n", + " }\n", + ").log(section_id=\"interpretability_insights\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py b/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py new file mode 100644 index 000000000..62127d82e --- /dev/null +++ b/notebooks/code_samples/credit_risk/custom_tests/ScoreBandDiscriminationMetrics.py @@ -0,0 +1,193 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from sklearn.metrics import roc_curve, roc_auc_score +from typing import Tuple +from validmind import tags, tasks +from validmind.vm_models import VMDataset, VMModel + + +@tags("visualization", "credit_risk", "scorecard") +@tasks("classification") +def ScoreBandDiscriminationMetrics( + dataset: VMDataset, + model: VMModel, + score_column: str = "score", + score_bands: list = None, + title: str = "Score Band Discrimination Metrics", +) -> Tuple[go.Figure, pd.DataFrame]: + """ + Evaluates discrimination metrics (AUC, GINI, KS) across different score bands for credit risk assessment. + + ### Purpose + + The Score Band Discrimination Metrics test is designed to evaluate the model's discriminatory power across + different score ranges. By segmenting the score distribution into bands and calculating key discrimination + metrics within each band, this test helps identify whether the model maintains consistent performance across + the entire score spectrum. 
This is crucial for understanding if the model's ability to separate good and bad + accounts varies significantly across different score ranges. + + ### Test Mechanism + + This test proceeds by first segmenting the score distribution into predefined bands. For each band, it + calculates three key discrimination metrics: AUC (Area Under the Curve), GINI coefficient, and KS + (Kolmogorov-Smirnov) statistic. The AUC measures the model's ability to rank order risk, the GINI + coefficient provides a measure of inequality in the predictions, and the KS statistic quantifies the maximum + separation between cumulative distributions. The test also tracks the population distribution and default + rates across bands to provide context for the discrimination metrics. + + ### Signs of High Risk + + - Significant variations in discrimination metrics between adjacent score bands + - Very low metric values in specific score ranges, indicating poor discrimination + - Inconsistent patterns in metric values across the score spectrum + - Large disparities between band-specific metrics and overall metrics + - Unexpected relationships between default rates and discrimination metrics + - Insufficient population in certain score bands for reliable metric calculation + + ### Strengths + + - Provides a comprehensive view of model discrimination across the score spectrum + - Combines multiple complementary metrics for robust performance assessment + - Identifies specific score ranges where model performance might be suboptimal + - Includes population and default rate context for better interpretation + - Handles edge cases such as single-class bands and insufficient data + - Enables visual comparison of metrics across score bands + + ### Limitations + + - Requires sufficient data in each score band for reliable metric calculation + - May be sensitive to the choice of score band boundaries + - Does not account for business importance of different score ranges + - Metrics may be unstable in bands with very low default rates + - Cannot directly suggest optimal score band boundaries + - Limited to assessing discrimination aspects of model performance + """ + if score_column not in dataset.df.columns: + raise ValueError(f"Score column '{score_column}' not found in dataset") + + df = dataset.df.copy() + + # Default score bands if none provided + if score_bands is None: + score_bands = [410, 440, 470] + + # Create band labels + band_labels = [ + f"{score_bands[i]}-{score_bands[i+1]}" for i in range(len(score_bands) - 1) + ] + band_labels.insert(0, f"<{score_bands[0]}") + band_labels.append(f">{score_bands[-1]}") + + # Bin the scores + df["score_band"] = pd.cut( + df[score_column], bins=[-np.inf] + score_bands + [np.inf], labels=band_labels + ) + + # Calculate metrics for each band + results = [] + for band in band_labels: + band_mask = df["score_band"] == band + if band_mask.sum() > 1: # Need at least 2 samples + y_true = df[band_mask][dataset.target_column].values + y_prob = dataset.y_prob(model)[ + band_mask + ] # Get predicted probabilities using dataset method + + # Convert to float arrays + y_true = np.array(y_true, dtype=float) + y_prob = np.array(y_prob, dtype=float) + + # Calculate metrics + try: + fpr, tpr, _ = roc_curve(y_true, y_prob) + ks = max(tpr - fpr) + auc = roc_auc_score(y_true, y_prob) + gini = 2 * auc - 1 + except ValueError: # Handle cases with single class + ks, auc, gini = 0, 0.5, 0 + + results.append( + { + "Score Band": band, + "Population Count": band_mask.sum(), + "Population (%)": 
(band_mask.sum() / len(df)) * 100, + "AUC": auc, + "GINI": gini, + "KS": ks, + "Default Rate (%)": (y_true.mean() * 100), + } + ) + + # Calculate total metrics + y_true = df[dataset.target_column].values + y_prob = dataset.y_prob(model) # Get predicted probabilities for total calculation + + fpr, tpr, _ = roc_curve(y_true, y_prob) + total_ks = max(tpr - fpr) + total_auc = roc_auc_score(y_true, y_prob) + total_gini = 2 * total_auc - 1 + + # Add total row + results.append( + { + "Score Band": f"Total ({df[score_column].min():.0f}-{df[score_column].max():.0f})", + "Population Count": len(df), + "Population (%)": 100.0, + "AUC": total_auc, + "GINI": total_gini, + "KS": total_ks, + "Default Rate (%)": (y_true.mean() * 100), + } + ) + + results_df = pd.DataFrame(results) + + # Create visualization (excluding total) + fig = go.Figure() + + # Filter out the total row for plotting + plot_df = results_df[results_df["Score Band"].str.contains("Total") == False] + + # Add metric bars + for metric, color in [ + ("AUC", "rgb(31, 119, 180)"), + ("GINI", "rgb(255, 127, 14)"), + ("KS", "rgb(44, 160, 44)"), + ]: + fig.add_trace( + go.Bar( + name=metric, + x=plot_df["Score Band"], + y=plot_df[metric], + marker_color=color, + ) + ) + + # Add default rate line (excluding total) + fig.add_trace( + go.Scatter( + name="Default Rate (%)", + x=plot_df["Score Band"], + y=plot_df["Default Rate (%)"], + yaxis="y2", + line=dict(color="red", width=2), + ) + ) + + # Update layout + fig.update_layout( + title=title, + xaxis_title="Score Band", + yaxis_title="Discrimination Metrics", + yaxis2=dict(title="Default Rate (%)", overlaying="y", side="right"), + barmode="group", + showlegend=True, + height=600, + ) + + return fig, results_df diff --git a/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py b/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py index 42db8b95a..257cf5ff2 100644 --- a/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +++ b/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py @@ -26,22 +26,52 @@ def CalibrationCurveDrift( drift_pct_threshold: float = 20, ): """ - Compares calibration curves between reference and monitoring datasets. + Evaluates changes in probability calibration between reference and monitoring datasets. ### Purpose - This test visualizes and quantifies differences in probability calibration between - reference and monitoring datasets to identify changes in model's probability estimates. + + The Calibration Curve Drift test is designed to assess changes in the model's probability calibration + over time. By comparing calibration curves between reference and monitoring datasets, this test helps + identify whether the model's probability estimates remain reliable in production. This is crucial for + understanding if the model's risk predictions maintain their intended interpretation and whether + recalibration might be necessary. ### Test Mechanism - Generates a plot with superimposed calibration curves and two tables comparing: - 1. Mean predicted probabilities per bin - 2. Actual fraction of positives per bin + + This test proceeds by generating calibration curves for both reference and monitoring datasets. For each + dataset, it bins the predicted probabilities and calculates the actual fraction of positives within each + bin. It then compares these values between datasets to identify significant shifts in calibration. 
+ The test quantifies drift as percentage changes in both mean predicted probabilities and actual fractions + of positives per bin, providing both visual and numerical assessments of calibration stability. ### Signs of High Risk - - Large differences between calibration curves - - Systematic over/under-estimation in monitoring dataset - - Changes in calibration for specific probability ranges + + - Large differences between reference and monitoring calibration curves + - Systematic over-estimation or under-estimation in monitoring dataset + - Significant drift percentages exceeding the threshold in multiple bins + - Changes in calibration concentrated in specific probability ranges + - Inconsistent drift patterns across the probability spectrum + - Empty or sparse bins indicating insufficient data for reliable comparison + + ### Strengths + + - Provides visual and quantitative assessment of calibration changes + - Identifies specific probability ranges where calibration has shifted + - Enables early detection of systematic prediction biases + - Includes detailed bin-by-bin comparison of calibration metrics + - Handles edge cases with insufficient data in certain bins + - Supports both binary and probabilistic interpretation of results + + ### Limitations + + - Requires sufficient data in each probability bin for reliable comparison + - Sensitive to choice of number of bins and binning strategy + - May not capture complex changes in probability distributions + - Cannot directly suggest recalibration parameters + - Limited to assessing probability calibration aspects + - Results may be affected by class imbalance changes """ + # Check for binary classification if len(np.unique(datasets[0].y)) > 2: raise SkipTestError( diff --git a/validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py b/validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py index 169f47ac8..c90a45a79 100644 --- a/validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +++ b/validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py @@ -41,16 +41,47 @@ def ClassDiscriminationDrift( Compares classification discrimination metrics between reference and monitoring datasets. ### Purpose - This test evaluates drift in discrimination metrics including ROC AUC, GINI, and KS statistics. + + The Class Discrimination Drift test is designed to evaluate changes in the model's discriminative power + over time. By comparing key discrimination metrics between reference and monitoring datasets, this test + helps identify whether the model maintains its ability to separate classes in production. This is crucial + for understanding if the model's predictive power remains stable and whether its decision boundaries + continue to effectively distinguish between different classes. ### Test Mechanism - Calculates discrimination metrics for both reference and monitoring datasets and - compares them to identify significant changes in model's discriminative power. + + This test proceeds by calculating three key discrimination metrics for both reference and monitoring + datasets: ROC AUC (Area Under the Curve), GINI coefficient, and KS (Kolmogorov-Smirnov) statistic. + For binary classification, it computes all three metrics. For multiclass problems, it focuses on + macro-averaged ROC AUC. The test quantifies drift as percentage changes in these metrics between + datasets, providing a comprehensive assessment of discrimination stability. 
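    As a purely illustrative sketch of the mechanism described above (not this module's implementation; helper
    names and the synthetic data are placeholders), the three metrics and their relative drift could be computed
    with standard scikit-learn utilities, using the same conventions that appear elsewhere in this patch
    (GINI = 2 * AUC - 1, KS = max(TPR - FPR)):

    ```python
    # Illustrative sketch: drift in AUC, GINI and KS between a reference and a monitoring sample.
    import numpy as np
    from sklearn.metrics import roc_auc_score, roc_curve


    def discrimination_metrics(y_true, y_prob):
        auc = roc_auc_score(y_true, y_prob)
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        return {"AUC": auc, "GINI": 2 * auc - 1, "KS": float(np.max(tpr - fpr))}


    def drift_pct(reference, monitoring):
        # Percentage change of each metric relative to its reference value
        return {
            name: (monitoring[name] - reference[name]) / abs(reference[name]) * 100
            for name in reference
        }


    rng = np.random.default_rng(0)
    y_ref, p_ref = rng.integers(0, 2, 1000), rng.random(1000)
    y_mon, p_mon = rng.integers(0, 2, 1000), rng.random(1000)
    print(drift_pct(discrimination_metrics(y_ref, p_ref), discrimination_metrics(y_mon, p_mon)))
    ```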
### Signs of High Risk - - Large drifts in discrimination metrics (above threshold) - - Significant drops in ROC AUC or GINI coefficient - - Reduced class separation as indicated by KS statistic + + - Large drifts in discrimination metrics exceeding the threshold + - Significant drops in ROC AUC indicating reduced ranking ability + - Decreased GINI coefficients showing diminished separation power + - Reduced KS statistics suggesting weaker class distinction + - Inconsistent changes across different metrics + - Systematic degradation in discriminative performance + + ### Strengths + + - Combines multiple complementary discrimination metrics + - Handles both binary and multiclass classification + - Provides clear quantitative drift assessment + - Enables early detection of model degradation + - Includes standardized drift threshold evaluation + - Supports comprehensive performance monitoring + + ### Limitations + + - Does not identify root causes of discrimination drift + - May be sensitive to changes in class distribution + - Cannot suggest optimal decision threshold adjustments + - Limited to discrimination aspects of performance + - Requires sufficient data for reliable metric calculation + - May not capture subtle changes in decision boundaries """ # Get predictions and true values y_true_ref = datasets[0].y diff --git a/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py b/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py index aa604f611..f11cee520 100644 --- a/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py +++ b/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py @@ -16,25 +16,52 @@ def ClassImbalanceDrift( datasets: List[VMDataset], drift_pct_threshold: float = 5.0, title: str = "Class Distribution Drift", -) -> Tuple[go.Figure, dict, bool]: +): """ Evaluates drift in class distribution between reference and monitoring datasets. ### Purpose - This test compares the class distribution between two datasets to identify - potential population drift in the target variable. + + The Class Imbalance Drift test is designed to detect changes in the distribution of target classes + over time. By comparing class proportions between reference and monitoring datasets, this test helps + identify whether the population structure remains stable in production. This is crucial for + understanding if the model continues to operate under similar class distribution assumptions and + whether retraining might be necessary due to significant shifts in class balance. ### Test Mechanism - - Calculates class percentages for both datasets - - Computes drift as the difference in percentages - - Visualizes distributions side by side - - Flags significant changes in class proportions + + This test proceeds by calculating class percentages for both reference and monitoring datasets. + It computes the proportion of each class and quantifies drift as the percentage difference in these + proportions between datasets. The test provides both visual and numerical comparisons of class + distributions, with special attention to changes that exceed the specified drift threshold. + Population stability is assessed on a class-by-class basis. 
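    A rough illustration of the per-class comparison described above (not the module's actual code): class
    proportions can be compared directly with pandas. Whether drift is then expressed in percentage points or as
    a relative change is an implementation detail; the sketch below uses percentage points.

    ```python
    # Illustrative sketch: class proportions in reference vs. monitoring targets.
    import pandas as pd


    def class_distribution_drift(y_reference: pd.Series, y_monitoring: pd.Series) -> pd.DataFrame:
        ref_pct = y_reference.value_counts(normalize=True) * 100
        mon_pct = y_monitoring.value_counts(normalize=True) * 100
        table = pd.DataFrame({"Reference (%)": ref_pct, "Monitoring (%)": mon_pct}).fillna(0.0)
        # Absolute shift per class, in percentage points
        table["Drift (pp)"] = table["Monitoring (%)"] - table["Reference (%)"]
        return table


    print(class_distribution_drift(pd.Series([0, 0, 0, 1]), pd.Series([0, 0, 1, 1])))
    ```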
### Signs of High Risk - - Large shifts in class proportions - - New classes appearing or existing classes disappearing - - Multiple classes showing significant drift - - Systematic shifts across multiple classes + + - Large shifts in class proportions exceeding the threshold + - Systematic changes affecting multiple classes + - Appearance of new classes or disappearance of existing ones + - Significant changes in minority class representation + - Reversal of majority-minority class relationships + - Unexpected changes in class ratios + + ### Strengths + + - Provides clear visualization of distribution changes + - Identifies specific classes experiencing drift + - Enables early detection of population shifts + - Includes standardized drift threshold evaluation + - Supports both binary and multiclass problems + - Maintains interpretable percentage-based metrics + + ### Limitations + + - Does not account for feature distribution changes + - Cannot identify root causes of class drift + - May be sensitive to small sample sizes + - Limited to target variable distribution only + - Requires sufficient samples per class + - May not capture subtle distribution changes """ # Validate inputs if not datasets[0].target_column or not datasets[1].target_column: diff --git a/validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py b/validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py index e07106a4c..819d284be 100644 --- a/validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +++ b/validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py @@ -21,17 +21,47 @@ def ClassificationAccuracyDrift( Compares classification accuracy metrics between reference and monitoring datasets. ### Purpose - This test evaluates drift in classification accuracy metrics including per-label and - macro-averaged precision, recall, and F1 scores. + + The Classification Accuracy Drift test is designed to evaluate changes in the model's predictive accuracy + over time. By comparing key accuracy metrics between reference and monitoring datasets, this test helps + identify whether the model maintains its performance levels in production. This is crucial for + understanding if the model's predictions remain reliable and whether its overall effectiveness has + degraded significantly. ### Test Mechanism - Calculates classification metrics for both reference and monitoring datasets and - compares them to identify significant changes in model performance. + + This test proceeds by calculating comprehensive accuracy metrics for both reference and monitoring + datasets. It computes overall accuracy, per-label precision, recall, and F1 scores, as well as + macro-averaged metrics. The test quantifies drift as percentage changes in these metrics between + datasets, providing both granular and aggregate views of accuracy changes. Special attention is paid + to per-label performance to identify class-specific degradation. 
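The per-label and macro-averaged comparison described there could look roughly like the sketch below; the helper name and the handling of zero reference values are assumptions, not the library's code.

```python
# Rough sketch: per-label precision/recall/F1 plus overall accuracy, with percentage drift.
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def accuracy_drift(y_true_ref, y_pred_ref, y_true_mon, y_pred_mon, drift_pct_threshold=20.0):
    labels = np.unique(np.concatenate([np.asarray(y_true_ref), np.asarray(y_true_mon)]))

    def metrics(y_true, y_pred):
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, labels=labels, zero_division=0
        )
        out = {"Accuracy": accuracy_score(y_true, y_pred)}
        for i, label in enumerate(labels):
            out.update({f"Precision_{label}": prec[i], f"Recall_{label}": rec[i], f"F1_{label}": f1[i]})
        out.update({"Precision_macro": prec.mean(), "Recall_macro": rec.mean(), "F1_macro": f1.mean()})
        return out

    ref, mon = metrics(y_true_ref, y_pred_ref), metrics(y_true_mon, y_pred_mon)
    drift = {k: 100 * (mon[k] - ref[k]) / ref[k] if ref[k] else float("nan") for k in ref}
    return {k: {"reference": ref[k], "monitoring": mon[k], "drift_pct": drift[k],
                "pass": abs(drift[k]) < drift_pct_threshold} for k in ref}
```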
### Signs of High Risk - - Large drifts in accuracy metrics (above threshold) + + - Large drifts in accuracy metrics exceeding the threshold - Inconsistent changes across different labels - Significant drops in macro-averaged metrics + - Systematic degradation in specific class performance + - Unexpected improvements suggesting data quality issues + - Divergent trends between precision and recall + + ### Strengths + + - Provides comprehensive accuracy assessment + - Identifies class-specific performance changes + - Enables early detection of model degradation + - Includes both micro and macro perspectives + - Supports multi-class classification evaluation + - Maintains interpretable drift thresholds + + ### Limitations + + - May be sensitive to class distribution changes + - Does not account for prediction confidence + - Cannot identify root causes of accuracy drift + - Limited to accuracy-based metrics only + - Requires sufficient samples per class + - May not capture subtle performance changes """ # Get predictions and true values y_true_ref = datasets[0].y diff --git a/validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py b/validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py index 8edf0a5fe..766c716c0 100644 --- a/validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +++ b/validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py @@ -21,17 +21,47 @@ def ConfusionMatrixDrift( Compares confusion matrix metrics between reference and monitoring datasets. ### Purpose - This test evaluates drift in confusion matrix elements including True Positives, - True Negatives, False Positives, and False Negatives. + + The Confusion Matrix Drift test is designed to evaluate changes in the model's error patterns + over time. By comparing confusion matrix elements between reference and monitoring datasets, this + test helps identify whether the model maintains consistent prediction behavior in production. This + is crucial for understanding if the model's error patterns have shifted and whether specific types + of misclassifications have become more prevalent. ### Test Mechanism - Calculates confusion matrices for both reference and monitoring datasets and - compares corresponding elements to identify significant changes in model predictions. + + This test proceeds by generating confusion matrices for both reference and monitoring datasets. + For binary classification, it tracks True Positives, True Negatives, False Positives, and False + Negatives as percentages of total predictions. For multiclass problems, it analyzes per-class + metrics including true positives and error rates. The test quantifies drift as percentage changes + in these metrics between datasets, providing detailed insight into shifting prediction patterns. 
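For the binary case, the percentage-of-total comparison described above can be sketched as follows; the label order and the relative drift formula are assumptions made for the example.

```python
# Sketch: confusion-matrix cells as percentages of all predictions, compared across datasets.
from sklearn.metrics import confusion_matrix


def confusion_matrix_drift(y_true_ref, y_pred_ref, y_true_mon, y_pred_mon, drift_pct_threshold=20.0):
    def cell_percentages(y_true, y_pred):
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        total = tn + fp + fn + tp
        return {"TN (%)": 100 * tn / total, "FP (%)": 100 * fp / total,
                "FN (%)": 100 * fn / total, "TP (%)": 100 * tp / total}

    ref, mon = cell_percentages(y_true_ref, y_pred_ref), cell_percentages(y_true_mon, y_pred_mon)
    return {
        cell: {"reference": ref[cell], "monitoring": mon[cell],
               "drift_pct": 100 * (mon[cell] - ref[cell]) / ref[cell] if ref[cell] else float("nan"),
               "pass": abs(mon[cell] - ref[cell]) / ref[cell] * 100 < drift_pct_threshold if ref[cell] else False}
        for cell in ref
    }
```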
### Signs of High Risk - - Large drifts in confusion matrix elements (above threshold) - - Significant changes in error patterns (FP, FN) + + - Large drifts in confusion matrix elements exceeding threshold + - Systematic changes in false positive or false negative rates - Inconsistent changes across different classes + - Significant shifts in error patterns for specific classes + - Unexpected improvements in certain metrics + - Divergent trends between different types of errors + + ### Strengths + + - Provides detailed analysis of prediction behavior + - Identifies specific types of prediction changes + - Enables early detection of systematic errors + - Includes comprehensive error pattern analysis + - Supports both binary and multiclass problems + - Maintains interpretable percentage-based metrics + + ### Limitations + + - May be sensitive to class distribution changes + - Cannot identify root causes of prediction drift + - Requires sufficient samples for reliable comparison + - Limited to hard predictions (not probabilities) + - May not capture subtle changes in decision boundaries + - Complex interpretation for multiclass problems """ # Get predictions and true values for reference dataset y_pred_ref = datasets[0].y_pred(model) diff --git a/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py b/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py index 6d6c228f8..0ad7100b8 100644 --- a/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +++ b/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py @@ -17,22 +17,51 @@ def CumulativePredictionProbabilitiesDrift( model: VMModel, ): """ - Compares cumulative prediction probability distributions between reference and - monitoring datasets for each class. + Compares cumulative prediction probability distributions between reference and monitoring datasets. ### Purpose - This test visualizes changes in the model's cumulative probability predictions between - reference and monitoring datasets by comparing their distributions for each class. + + The Cumulative Prediction Probabilities Drift test is designed to evaluate changes in the model's + probability predictions over time. By comparing cumulative distribution functions of predicted + probabilities between reference and monitoring datasets, this test helps identify whether the + model's probability assignments remain stable in production. This is crucial for understanding if + the model's risk assessment behavior has shifted and whether its probability calibration remains + consistent. ### Test Mechanism - For each class, creates a figure with two subplots: - 1. Cumulative distributions comparison - 2. Difference between monitoring and reference distributions + + This test proceeds by generating cumulative distribution functions (CDFs) of predicted probabilities + for both reference and monitoring datasets. For each class, it plots the cumulative proportion of + predictions against probability values, enabling direct comparison of probability distributions. + The test visualizes both the CDFs and their differences, providing insight into how probability + assignments have shifted across the entire probability range. 
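The two-panel CDF comparison can be pictured with the sketch below; matplotlib is used here purely for brevity and the function name is invented for illustration.

```python
# Sketch: empirical CDFs of predicted probabilities for one class, plus their pointwise difference.
import matplotlib.pyplot as plt
import numpy as np


def plot_cumulative_probability_drift(p_ref, p_mon, n_points=201):
    grid = np.linspace(0.0, 1.0, n_points)
    cdf_ref = np.searchsorted(np.sort(p_ref), grid, side="right") / len(p_ref)
    cdf_mon = np.searchsorted(np.sort(p_mon), grid, side="right") / len(p_mon)

    fig, (ax_cdf, ax_diff) = plt.subplots(2, 1, figsize=(7, 6), sharex=True)
    ax_cdf.plot(grid, cdf_ref, label="Reference")
    ax_cdf.plot(grid, cdf_mon, label="Monitoring")
    ax_cdf.set_ylabel("Cumulative proportion")
    ax_cdf.legend()
    ax_diff.plot(grid, cdf_mon - cdf_ref)
    ax_diff.axhline(0.0, linestyle="--", linewidth=1)
    ax_diff.set_xlabel("Predicted probability")
    ax_diff.set_ylabel("Monitoring - Reference")
    return fig
```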
### Signs of High Risk - - Significant shifts in cumulative distributions - - Large differences between reference and monitoring curves - - Systematic differences across probability ranges + + - Large gaps between reference and monitoring CDFs + - Systematic shifts in probability assignments + - Concentration of differences in specific probability ranges + - Changes in the shape of probability distributions + - Unexpected patterns in cumulative differences + - Significant shifts in probability thresholds + + ### Strengths + + - Provides comprehensive view of probability changes + - Identifies specific probability ranges with drift + - Enables visualization of distribution differences + - Supports analysis across multiple classes + - Maintains interpretable probability scale + - Captures subtle changes in probability assignments + + ### Limitations + + - Does not provide single drift metric + - May be complex to interpret for multiple classes + - Cannot suggest probability recalibration + - Requires visual inspection for assessment + - Sensitive to sample size differences + - May not capture class-specific calibration issues """ # Get predictions and true values y_prob_ref = datasets[0].y_prob(model) diff --git a/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py index b975526fe..d519d5af6 100644 --- a/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +++ b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py @@ -21,24 +21,51 @@ def PredictionProbabilitiesHistogramDrift( drift_pct_threshold: float = 20.0, ): """ - Compares prediction probability distributions between reference and monitoring datasets - for each class. + Compares prediction probability distributions between reference and monitoring datasets. ### Purpose - This test visualizes and quantifies changes in the model's probability predictions - between reference and monitoring datasets by comparing their distributions for each class. + + The Prediction Probabilities Histogram Drift test is designed to evaluate changes in the model's + probability predictions over time. By comparing probability distributions between reference and + monitoring datasets using histograms, this test helps identify whether the model's probability + assignments have shifted in production. This is crucial for understanding if the model's risk + assessment behavior remains consistent and whether its probability estimates maintain their + original distribution patterns. ### Test Mechanism - - Creates histograms of prediction probabilities for each class - - Superimposes reference and monitoring distributions - - Computes distribution moments and their drift - - Uses separate subplots for each class for clear comparison + + This test proceeds by generating histograms of prediction probabilities for both reference and + monitoring datasets. For each class, it analyzes the distribution shape, central tendency, and + spread of probabilities. The test computes distribution moments (mean, variance, skewness, + kurtosis) and quantifies their drift between datasets. Visual comparison of overlaid histograms + provides immediate insight into distribution changes. 
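The moment-drift calculation described there is sketched below; the moment list and the 20% threshold mirror the docstring, while the helper itself is only illustrative.

```python
# Sketch: compare distribution moments of predicted probabilities between datasets.
import numpy as np
from scipy.stats import kurtosis, skew


def probability_moment_drift(p_ref, p_mon, drift_pct_threshold=20.0):
    def moments(p):
        p = np.asarray(p, dtype=float)
        return {"mean": p.mean(), "variance": p.var(),
                "skewness": skew(p), "kurtosis": kurtosis(p)}

    ref, mon = moments(p_ref), moments(p_mon)
    report = {}
    for name in ref:
        drift_pct = 100 * (mon[name] - ref[name]) / abs(ref[name]) if ref[name] else float("nan")
        report[name] = {"reference": ref[name], "monitoring": mon[name],
                        "drift_pct": drift_pct, "pass": abs(drift_pct) < drift_pct_threshold}
    return report
```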
### Signs of High Risk - - Significant shifts in probability distributions - - Changes in the shape of distributions - - New modes or peaks appearing in monitoring data - - Large differences in distribution moments + + - Significant shifts in probability distribution shapes + - Large drifts in distribution moments exceeding threshold + - Appearance of new modes or peaks in monitoring data + - Changes in the spread or concentration of probabilities + - Systematic shifts in probability assignments + - Unexpected changes in distribution characteristics + + ### Strengths + + - Provides intuitive visualization of probability changes + - Identifies specific changes in distribution shape + - Enables quantitative assessment of distribution drift + - Supports analysis across multiple classes + - Includes comprehensive moment analysis + - Maintains interpretable probability scale + + ### Limitations + + - May be sensitive to binning choices + - Requires sufficient samples for reliable histograms + - Cannot suggest probability recalibration + - Complex interpretation for multiple classes + - May not capture subtle distribution changes + - Limited to univariate probability analysis """ # Get predictions and true values y_prob_ref = datasets[0].y_prob(model) diff --git a/validmind/tests/ongoing_monitoring/ROCCurveDrift.py b/validmind/tests/ongoing_monitoring/ROCCurveDrift.py index c8d29459e..3f783a563 100644 --- a/validmind/tests/ongoing_monitoring/ROCCurveDrift.py +++ b/validmind/tests/ongoing_monitoring/ROCCurveDrift.py @@ -24,18 +24,48 @@ def ROCCurveDrift(datasets: List[VMDataset], model: VMModel): Compares ROC curves between reference and monitoring datasets. ### Purpose - This test visualizes the differences in ROC curves and AUC scores between reference - and monitoring datasets to identify changes in model's discriminative ability. + + The ROC Curve Drift test is designed to evaluate changes in the model's discriminative ability + over time. By comparing Receiver Operating Characteristic (ROC) curves between reference and + monitoring datasets, this test helps identify whether the model maintains its ability to + distinguish between classes across different decision thresholds. This is crucial for + understanding if the model's trade-off between sensitivity and specificity remains stable + in production. ### Test Mechanism - Generates two plots: - 1. Superimposed ROC curves for both datasets - 2. Difference between ROC curves (Monitoring - Reference) + + This test proceeds by generating ROC curves for both reference and monitoring datasets. For each + dataset, it plots the True Positive Rate against the False Positive Rate across all possible + classification thresholds. The test also computes AUC scores and visualizes the difference + between ROC curves, providing both graphical and numerical assessments of discrimination + stability. Special attention is paid to regions where curves diverge significantly. 
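Interpolating both curves onto a shared FPR grid, as sketched below, is one plausible way to realise the "difference between ROC curves" described above; the grid size and the return structure are assumptions.

```python
# Sketch: ROC curves for both datasets, interpolated to a common FPR grid and subtracted.
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve


def roc_curve_difference(y_ref, p_ref, y_mon, p_mon, n_points=100):
    fpr_grid = np.linspace(0.0, 1.0, n_points)
    fpr_ref, tpr_ref, _ = roc_curve(y_ref, p_ref)
    fpr_mon, tpr_mon, _ = roc_curve(y_mon, p_mon)
    tpr_ref_i = np.interp(fpr_grid, fpr_ref, tpr_ref)  # fpr from roc_curve is non-decreasing
    tpr_mon_i = np.interp(fpr_grid, fpr_mon, tpr_mon)
    return {
        "auc_reference": roc_auc_score(y_ref, p_ref),
        "auc_monitoring": roc_auc_score(y_mon, p_mon),
        "fpr_grid": fpr_grid,
        "tpr_difference": tpr_mon_i - tpr_ref_i,  # Monitoring minus Reference
    }
```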
### Signs of High Risk - - Large differences between ROC curves + + - Large differences between reference and monitoring ROC curves - Significant drop in AUC score for monitoring dataset - Systematic differences in specific FPR regions + - Changes in optimal operating points + - Inconsistent performance across different thresholds + - Unexpected crossovers between curves + + ### Strengths + + - Provides comprehensive view of discriminative ability + - Identifies specific threshold ranges with drift + - Enables visualization of performance differences + - Includes AUC comparison for overall assessment + - Supports threshold-independent evaluation + - Maintains interpretable performance metrics + + ### Limitations + + - Limited to binary classification problems + - May be sensitive to class distribution changes + - Cannot suggest optimal threshold adjustments + - Requires visual inspection for detailed analysis + - Complex interpretation of curve differences + - May not capture subtle performance changes """ # Check for binary classification if len(np.unique(datasets[0].y)) > 2: diff --git a/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py b/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py index 8599c6894..c820b7e4b 100644 --- a/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +++ b/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py @@ -19,25 +19,52 @@ def ScoreBandsDrift( drift_threshold: float = 20.0, ): """ - Analyzes drift in population distribution and default rates across score bands between - reference and monitoring datasets. + Analyzes drift in population distribution and default rates across score bands. ### Purpose - This test evaluates changes in score band metrics between reference and monitoring - datasets to identify potential drift in model behavior. + + The Score Bands Drift test is designed to evaluate changes in score-based risk segmentation + over time. By comparing population distribution and default rates across score bands between + reference and monitoring datasets, this test helps identify whether the model's risk + stratification remains stable in production. This is crucial for understanding if the model's + scoring behavior maintains its intended risk separation and whether specific score ranges + have experienced significant shifts. ### Test Mechanism - Compares three key metrics across score bands: - 1. Population distribution (%) - 2. Predicted default rates (%) - 3. Observed default rates (%) - Calculates drift percentages and flags significant changes. + + This test proceeds by segmenting scores into predefined bands and analyzing three key metrics + across these bands: population distribution, predicted default rates, and observed default + rates. For each band, it computes these metrics for both reference and monitoring datasets + and quantifies drift as percentage changes. The test provides both detailed band-by-band + comparisons and overall stability assessment, with special attention to bands showing + significant drift. 
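A pandas sketch of the band-by-band comparison follows; the band edges, the column names, and the focus on population drift only are all assumptions made for this example.

```python
# Sketch: population share and observed default rate per score band, with drift on the former.
import numpy as np
import pandas as pd


def score_band_drift(df_ref, df_mon, score_column="score", target_column="loan_status",
                     band_edges=(410, 440, 470), drift_threshold=20.0):
    bins = [-np.inf, *band_edges, np.inf]

    def summarize(df):
        bands = pd.cut(df[score_column], bins=bins)
        grouped = df.groupby(bands, observed=False)
        return pd.DataFrame({
            "population_pct": 100 * grouped.size() / len(df),
            "default_rate_pct": 100 * grouped[target_column].mean(),
        })

    ref, mon = summarize(df_ref), summarize(df_mon)
    bands = ref.join(mon, lsuffix="_ref", rsuffix="_mon")
    bands["population_drift_pct"] = (
        100 * (bands["population_pct_mon"] - bands["population_pct_ref"]) / bands["population_pct_ref"]
    )
    bands["pass"] = bands["population_drift_pct"].abs() < drift_threshold
    return bands
```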
### Signs of High Risk + - Large shifts in population distribution across bands - - Significant changes in default rates - - Inconsistent drift patterns across bands - - Multiple metrics showing high drift simultaneously + - Significant changes in default rates within bands + - Inconsistent drift patterns between adjacent bands + - Divergence between predicted and observed rates + - Systematic shifts in risk concentration + - Empty or sparse score bands in monitoring data + + ### Strengths + + - Provides comprehensive view of score-based drift + - Identifies specific score ranges with instability + - Enables comparison of multiple risk metrics + - Includes both distribution and performance drift + - Supports business-relevant score segmentation + - Maintains interpretable drift thresholds + + ### Limitations + + - Sensitive to choice of score band boundaries + - Requires sufficient samples in each band + - Cannot suggest optimal band adjustments + - May not capture within-band distribution changes + - Limited to predefined scoring metrics + - Complex interpretation with multiple drift signals """ # Validate score column if score_column not in datasets[0].df.columns: diff --git a/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py b/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py index 939a69d12..fe82416eb 100644 --- a/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +++ b/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py @@ -24,20 +24,48 @@ def ScorecardHistogramDrift( Compares score distributions between reference and monitoring datasets for each class. ### Purpose - This test visualizes and quantifies changes in the model's scoring between reference - and monitoring datasets by comparing their distributions for each class. + + The Scorecard Histogram Drift test is designed to evaluate changes in the model's scoring + patterns over time. By comparing score distributions between reference and monitoring datasets + for each class, this test helps identify whether the model's scoring behavior remains stable + in production. This is crucial for understanding if the model's risk assessment maintains + consistent patterns and whether specific score ranges have experienced significant shifts + in their distribution. ### Test Mechanism - - Creates histograms of scores for each class - - Superimposes reference and monitoring distributions - - Computes distribution moments and their drift - - Uses separate subplots for each class for clear comparison + + This test proceeds by generating histograms of scores for each class in both reference and + monitoring datasets. It analyzes distribution characteristics through multiple statistical + moments: mean, variance, skewness, and kurtosis. The test quantifies drift as percentage + changes in these moments between datasets, providing both visual and numerical assessments + of distribution stability. Special attention is paid to class-specific distribution changes. 
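Per-class moment drift for the score column could be tabulated as in the sketch below; the column names and the moment list are assumptions carried over from the surrounding docstrings.

```python
# Sketch: per-class score moments (mean, variance, skewness, kurtosis) and their drift.
import pandas as pd
from scipy.stats import kurtosis, skew


def score_distribution_drift(df_ref, df_mon, score_column="score", target_column="loan_status",
                             drift_pct_threshold=20.0):
    records = []
    for cls in sorted(df_ref[target_column].unique()):
        s_ref = df_ref.loc[df_ref[target_column] == cls, score_column]
        s_mon = df_mon.loc[df_mon[target_column] == cls, score_column]
        moments = {"mean": (s_ref.mean(), s_mon.mean()),
                   "variance": (s_ref.var(), s_mon.var()),
                   "skewness": (skew(s_ref), skew(s_mon)),
                   "kurtosis": (kurtosis(s_ref), kurtosis(s_mon))}
        for name, (ref_val, mon_val) in moments.items():
            drift = 100 * (mon_val - ref_val) / abs(ref_val) if ref_val else float("nan")
            records.append({"class": cls, "moment": name, "reference": ref_val,
                            "monitoring": mon_val, "drift_pct": drift,
                            "pass": abs(drift) < drift_pct_threshold})
    return pd.DataFrame(records)
```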
### Signs of High Risk - - Significant shifts in score distributions - - Changes in the shape of distributions - - New modes or peaks appearing in monitoring data - - Large differences in distribution moments + + - Significant shifts in score distribution shapes + - Large drifts in distribution moments exceeding threshold + - Changes in the relative positioning of class distributions + - Appearance of new modes or peaks in monitoring data + - Unexpected changes in score spread or concentration + - Systematic shifts in class-specific scoring patterns + + ### Strengths + + - Provides class-specific distribution analysis + - Identifies detailed changes in scoring patterns + - Enables visual comparison of distributions + - Includes comprehensive moment analysis + - Supports multiple class evaluation + - Maintains interpretable score scale + + ### Limitations + + - Sensitive to binning choices in visualization + - Requires sufficient samples per class + - Cannot suggest score adjustments + - May not capture subtle distribution changes + - Complex interpretation with multiple classes + - Limited to univariate score analysis """ # Verify score column exists if score_column not in datasets[0].df.columns: From cb183531b8f65acff51bc58537cc407afab59e82 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 20:39:09 +0100 Subject: [PATCH 13/14] Fix lint --- validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py b/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py index f11cee520..a55879ea0 100644 --- a/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py +++ b/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py @@ -4,7 +4,7 @@ import pandas as pd import plotly.graph_objs as go -from typing import List, Tuple +from typing import List from validmind import tags, tasks from validmind.vm_models import VMDataset from validmind.errors import SkipTestError From 13bd1d9475b632870187ae7fde560d2a42f81462 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 10 Jan 2025 21:19:30 +0100 Subject: [PATCH 14/14] 2.7.6 --- pyproject.toml | 2 +- validmind/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 01e761ab0..60a40a5c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ description = "ValidMind Library" license = "Commercial License" name = "validmind" readme = "README.pypi.md" -version = "2.7.5" +version = "2.7.6" [tool.poetry.dependencies] aiohttp = {extras = ["speedups"], version = "*"} diff --git a/validmind/__version__.py b/validmind/__version__.py index a42f83a95..7b5077818 100644 --- a/validmind/__version__.py +++ b/validmind/__version__.py @@ -1 +1 @@ -__version__ = "2.7.5" +__version__ = "2.7.6"
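Taken together, these monitoring tests are executed against a reference dataset, a monitoring dataset, and the deployed model. The snippet below shows one plausible invocation; the test ID string, the input keys, and the `vm_*` variables are placeholders assumed for illustration.

```python
# Hypothetical usage sketch: run one of the drift tests and log the result for monitoring.
from validmind.tests import run_test

result = run_test(
    "validmind.ongoing_monitoring.ClassDiscriminationDrift",  # assumed test ID
    inputs={
        "datasets": [vm_reference_ds, vm_monitoring_ds],  # reference first, then monitoring
        "model": vm_model,
    },
    params={"drift_pct_threshold": 20.0},
)
result.log()  # upload the result so it appears in the ongoing monitoring documentation
```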