723 changes: 723 additions & 0 deletions notebooks/how_to/assign_score_complete_tutorial.ipynb

Large diffs are not rendered by default.
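Since the tutorial notebook itself is not rendered here, below is a minimal sketch of the assign_scores workflow it introduces, reconstructed from the tests added in this PR. The vm.init_dataset / vm.init_model entry points (and the usual vm.init() credential setup they imply) are assumptions about the public API; the assign_predictions / assign_scores calls and the "{input_id}_{metric}" column naming are exactly what the tests below exercise.

import pandas as pd
from sklearn.linear_model import LogisticRegression

import validmind as vm

# vm.init(api_key=..., api_secret=...)  # assumed to be needed in a real notebook session

df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})

# Wrap the raw data and the trained model in ValidMind objects
vm_dataset = vm.init_dataset(dataset=df, target_column="y", feature_columns=["x1", "x2"])
model = LogisticRegression().fit(df[["x1", "x2"]], df["y"])
vm_model = vm.init_model(input_id="test_model", model=model)

# Predictions must be assigned before unit metrics can be computed
vm_dataset.assign_predictions(model=vm_model)

# A single metric, a list of metrics, or a full metric ID all work
vm_dataset.assign_scores(vm_model, "F1")
vm_dataset.assign_scores(vm_model, ["Precision", "Recall"])
vm_dataset.assign_scores(vm_model, "validmind.unit_metrics.classification.Accuracy")

# Each call adds a column named "{model.input_id}_{metric}", e.g. "test_model_F1",
# holding the same scalar value in every row
print([c for c in vm_dataset.df.columns if c.startswith("test_model_")])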

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ description = "ValidMind Library"
license = "Commercial License"
name = "validmind"
readme = "README.pypi.md"
version = "2.8.31"
version = "2.9.0"

[tool.poetry.dependencies]
aiohttp = {extras = ["speedups"], version = "*"}
295 changes: 295 additions & 0 deletions tests/test_dataset.py
@@ -516,6 +516,301 @@ def test_assign_predictions_with_invalid_predict_fn(self):

self.assertIn("FunctionModel requires a callable predict_fn", str(context.exception))

def test_assign_scores_single_metric(self):
"""
Test assigning a single metric score to dataset
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train a simple model
model = LogisticRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(input_id="test_model", model=model, __log=False)

# Assign predictions first (required for unit metrics)
vm_dataset.assign_predictions(model=vm_model)

# Test assign_scores with single metric
vm_dataset.assign_scores(vm_model, "F1")

# Check that the metric column was added
expected_column = f"{vm_model.input_id}_F1"
self.assertTrue(expected_column in vm_dataset.df.columns)

# Verify the column has the same value for all rows (scalar metric)
metric_values = vm_dataset.df[expected_column]
self.assertEqual(metric_values.nunique(), 1, "All rows should have the same metric value")

# Verify the value is reasonable for F1 score (between 0 and 1)
f1_value = metric_values.iloc[0]
self.assertTrue(0 <= f1_value <= 1, f"F1 score should be between 0 and 1, got {f1_value}")

def test_assign_scores_multiple_metrics(self):
"""
Test assigning multiple metric scores to dataset
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train a simple model
model = LogisticRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(input_id="test_model", model=model, __log=False)

# Assign predictions first
vm_dataset.assign_predictions(model=vm_model)

# Test assign_scores with multiple metrics
metrics = ["F1", "Precision", "Recall"]
vm_dataset.assign_scores(vm_model, metrics)

# Check that all metric columns were added
for metric in metrics:
expected_column = f"{vm_model.input_id}_{metric}"
self.assertTrue(expected_column in vm_dataset.df.columns)

# Verify each column has the same value for all rows
metric_values = vm_dataset.df[expected_column]
self.assertEqual(metric_values.nunique(), 1, f"All rows should have the same {metric} value")

# Verify the value is reasonable (between 0 and 1 for these metrics)
metric_value = metric_values.iloc[0]
self.assertTrue(0 <= metric_value <= 1, f"{metric} should be between 0 and 1, got {metric_value}")

def test_assign_scores_with_parameters(self):
"""
Test assigning metric scores with custom parameters
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train a simple model
model = LogisticRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(input_id="test_model", model=model, __log=False)

# Assign predictions first
vm_dataset.assign_predictions(model=vm_model)

# Test assign_scores with parameters
vm_dataset.assign_scores(vm_model, "ROC_AUC", **{"average": "weighted"})

# Check that the metric column was added
expected_column = f"{vm_model.input_id}_ROC_AUC"
self.assertTrue(expected_column in vm_dataset.df.columns)

# Verify the value is reasonable for ROC AUC (between 0 and 1)
roc_values = vm_dataset.df[expected_column]
roc_value = roc_values.iloc[0]
self.assertTrue(0 <= roc_value <= 1, f"ROC AUC should be between 0 and 1, got {roc_value}")

def test_assign_scores_full_metric_id(self):
"""
Test assigning scores using full metric IDs
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train a simple model
model = LogisticRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(input_id="test_model", model=model, __log=False)

# Assign predictions first
vm_dataset.assign_predictions(model=vm_model)

# Test assign_scores with full metric ID
full_metric_id = "validmind.unit_metrics.classification.Accuracy"
vm_dataset.assign_scores(vm_model, full_metric_id)

# Check that the metric column was added with correct name
expected_column = f"{vm_model.input_id}_Accuracy"
self.assertTrue(expected_column in vm_dataset.df.columns)

# Verify the value is reasonable for accuracy (between 0 and 1)
accuracy_values = vm_dataset.df[expected_column]
accuracy_value = accuracy_values.iloc[0]
self.assertTrue(0 <= accuracy_value <= 1, f"Accuracy should be between 0 and 1, got {accuracy_value}")

def test_assign_scores_regression_model(self):
"""
Test assigning metric scores for regression model
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0.1, 1.2, 2.3]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train a regression model
model = LinearRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(input_id="reg_model", model=model, __log=False)

# Assign predictions first
vm_dataset.assign_predictions(model=vm_model)

# Test assign_scores with regression metrics
vm_dataset.assign_scores(vm_model, ["MeanSquaredError", "RSquaredScore"])

# Check that both metric columns were added
expected_columns = ["reg_model_MeanSquaredError", "reg_model_RSquaredScore"]
for column in expected_columns:
self.assertTrue(column in vm_dataset.df.columns)

# Verify R-squared is plausible (it is at most 1 and can be negative for poor fits; -2 is a loose lower bound for this tiny dataset)
r2_values = vm_dataset.df["reg_model_RSquaredScore"]
r2_value = r2_values.iloc[0]
self.assertTrue(-2 <= r2_value <= 1, f"R-squared should be reasonable, got {r2_value}")

# Verify MSE is non-negative
mse_values = vm_dataset.df["reg_model_MeanSquaredError"]
mse_value = mse_values.iloc[0]
self.assertTrue(mse_value >= 0, f"MSE should be non-negative, got {mse_value}")

def test_assign_scores_no_model_input_id(self):
"""
Test that assign_scores raises error when model has no input_id
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Create model without input_id
model = LogisticRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(model=model, __log=False) # No input_id provided

# Clear the input_id to test the error case
vm_model.input_id = None

# Should raise ValueError
with self.assertRaises(ValueError) as context:
vm_dataset.assign_scores(vm_model, "F1")

self.assertIn("Model input_id must be set", str(context.exception))

def test_assign_scores_invalid_metric(self):
"""
Test that assign_scores raises error for invalid metric
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train a simple model
model = LogisticRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(input_id="test_model", model=model, __log=False)

# Assign predictions first
vm_dataset.assign_predictions(model=vm_model)

# Should raise ValueError for invalid metric
with self.assertRaises(ValueError) as context:
vm_dataset.assign_scores(vm_model, "InvalidMetricName")

self.assertIn("Metric 'InvalidMetricName' not found", str(context.exception))

def test_assign_scores_no_predictions(self):
"""
Test that assign_scores raises error when predictions haven't been assigned yet
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train a simple model
model = LogisticRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(input_id="test_model", model=model, __log=False)

# Don't assign predictions - test that assign_scores raises error
# (unit metrics require predictions to be available)
with self.assertRaises(ValueError) as context:
vm_dataset.assign_scores(vm_model, "F1")

self.assertIn("No prediction column found", str(context.exception))

def test_assign_scores_column_naming_convention(self):
"""
Test that assign_scores follows the correct column naming convention
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train a simple model
model = LogisticRegression()
model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_model = init_model(input_id="my_special_model", model=model, __log=False)

# Assign predictions first
vm_dataset.assign_predictions(model=vm_model)

# Test multiple metrics to verify naming convention
metrics = ["F1", "Precision", "Recall"]
vm_dataset.assign_scores(vm_model, metrics)

# Verify all columns follow the naming convention: {model.input_id}_{metric_name}
for metric in metrics:
expected_column = f"my_special_model_{metric}"
self.assertTrue(expected_column in vm_dataset.df.columns,
f"Expected column '{expected_column}' not found")

def test_assign_scores_multiple_models(self):
"""
Test assigning scores from multiple models to same dataset
"""
df = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6], "y": [0, 1, 0]})
vm_dataset = DataFrameDataset(
raw_dataset=df, target_column="y", feature_columns=["x1", "x2"]
)

# Train two different models
lr_model = LogisticRegression()
lr_model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_lr_model = init_model(input_id="lr_model", model=lr_model, __log=False)

rf_model = RandomForestClassifier(n_estimators=5, random_state=42)
rf_model.fit(vm_dataset.x, vm_dataset.y.ravel())
vm_rf_model = init_model(input_id="rf_model", model=rf_model, __log=False)

# Assign predictions for both models
vm_dataset.assign_predictions(model=vm_lr_model)
vm_dataset.assign_predictions(model=vm_rf_model)

# Assign scores for both models
vm_dataset.assign_scores(vm_lr_model, "F1")
vm_dataset.assign_scores(vm_rf_model, "F1")

# Check that both metric columns exist with correct names
lr_column = "lr_model_F1"
rf_column = "rf_model_F1"

self.assertTrue(lr_column in vm_dataset.df.columns)
self.assertTrue(rf_column in vm_dataset.df.columns)

# The two models may produce different F1 values; extract both for checking
lr_f1 = vm_dataset.df[lr_column].iloc[0]
rf_f1 = vm_dataset.df[rf_column].iloc[0]

# Both should be valid F1 scores
self.assertTrue(0 <= lr_f1 <= 1)
self.assertTrue(0 <= rf_f1 <= 1)


if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion validmind/__version__.py
@@ -1 +1 @@
__version__ = "2.8.31"
__version__ = "2.9.0"
8 changes: 7 additions & 1 deletion validmind/tests/output.py
@@ -45,7 +45,13 @@ def process(self, item: Any, result: TestResult) -> None:

class MetricOutputHandler(OutputHandler):
def can_handle(self, item: Any) -> bool:
return isinstance(item, (int, float))
# Accept individual numbers
if isinstance(item, (int, float)):
return True
# Accept lists/arrays of numbers for per-row metrics
if isinstance(item, (list, tuple, np.ndarray)):
return all(isinstance(x, (int, float, np.number)) for x in item)
return False

def process(self, item: Any, result: TestResult) -> None:
if result.metric is not None:
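The change to MetricOutputHandler above widens can_handle so a unit metric may return a list or array of per-row values instead of a single scalar. Below is a standalone re-implementation of just that acceptance rule, for illustration only (the real handler also records the value on the TestResult in process):

from typing import Any

import numpy as np

def can_handle(item: Any) -> bool:
    # Scalars are accepted exactly as before
    if isinstance(item, (int, float)):
        return True
    # Sequences are accepted only when every element is numeric,
    # which is what per-row metrics such as AbsoluteError return
    if isinstance(item, (list, tuple, np.ndarray)):
        return all(isinstance(x, (int, float, np.number)) for x in item)
    return False

assert can_handle(0.87)                    # scalar metric, e.g. F1
assert can_handle([0.0, 1.0, 0.0])         # per-row metric, e.g. AbsoluteError
assert can_handle(np.array([0.1, 0.2]))    # numpy arrays of numbers
assert not can_handle(["a", "b"])          # non-numeric sequences are rejected
assert not can_handle({"metric": 0.87})    # anything else falls through to other handlers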
42 changes: 42 additions & 0 deletions validmind/unit_metrics/classification/individual/AbsoluteError.py
@@ -0,0 +1,42 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import List

import numpy as np

from validmind import tags, tasks
from validmind.vm_models import VMDataset, VMModel


@tasks("classification")
@tags("classification")
def AbsoluteError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
"""Calculates the absolute error per row for a classification model.

For classification tasks, this computes the absolute difference between
the true class labels and predicted class labels for each individual row.
For binary 0/1 labels this reduces to a per-row misclassification indicator
(0.0 for a correct prediction, 1.0 for an incorrect one).

Args:
model: The classification model to evaluate
dataset: The dataset containing true labels and predictions
**kwargs: Additional parameters (accepted for interface compatibility; unused)

Returns:
List[float]: Per-row absolute errors as a list of float values
"""
y_true = dataset.y
y_pred = dataset.y_pred(model)

# Convert to numpy arrays and ensure same data type
y_true = np.asarray(y_true)
y_pred = np.asarray(y_pred)

# For classification, compute absolute difference between true and predicted labels
absolute_errors = np.abs(y_true - y_pred)

# Return as a list of floats
return absolute_errors.astype(float).tolist()
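A rough usage sketch for the new per-row metric, continuing from a dataset and model that already have predictions assigned (as in the tests above). That assign_scores resolves this metric by its full ID and names the column test_model_AbsoluteError, following the {input_id}_{metric} convention, is an assumption based on the tests; the per-row values themselves follow directly from the implementation above.

# Assumes vm_dataset / vm_model are set up as in the sketch near the top of this diff
vm_dataset.assign_scores(
    vm_model, "validmind.unit_metrics.classification.individual.AbsoluteError"
)

# Unlike scalar metrics such as F1, this column can vary per row:
# 0.0 where the predicted label matches y, 1.0 where it does not
print(vm_dataset.df["test_model_AbsoluteError"])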