Skip to content
6 changes: 6 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,9 @@ Robust
robust.RobustWeightedClassifier
robust.RobustWeightedRegressor
robust.RobustWeightedKMeans

.. autosummary::
:toctree: generated/
:template: function.rst

robust.make_huber_metric
21 changes: 21 additions & 0 deletions doc/modules/robust.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,23 @@ This algorithm has been studied in the context of "mom" weights in the
article [1]_, the context of "huber" weights has been mentioned in [2]_.
Both weighting schemes can be seen as special cases of the algorithm in [3]_.


Robust model selection
----------------------

One of the big challenges of robust machine learning is that the usual scoring
scheme (cross-validation with mean squared error, for instance) is not robust.
Indeed, if the dataset has some outliers, then the test sets in cross-validation
may also contain outliers, and the cross-validation MSE would then report a huge
error for our robust algorithm on any corrupted data.

To solve this problem, one can use robust score methods when doing
cross-validation using `make_huber_metric`. See the following example:

:ref:`sphx_glr_auto_examples_robust_plot_robust_cv_example.py`

This type of robust cross-validation was mentioned for instance in [4]_.

Comparison with other robust estimators
---------------------------------------

Expand Down Expand Up @@ -203,3 +220,7 @@ the example with California housing real dataset, for further discussion.
.. [3] Stanislav Minsker and Timothée Mathieu.
`"Excess risk bounds in robust empirical risk minimization" <https://arxiv.org/abs/1910.07485>`_
arXiv preprint (2019). arXiv:1910.07485.

.. [4] Elvezio Ronchetti, Christopher Field & Wade Blanchard.
       `"Robust Linear Model Selection by Cross-Validation" <https://www.tandfonline.com/doi/abs/10.1080/01621459.1997.10474057>`_
       Journal of the American Statistical Association (1997).
55 changes: 55 additions & 0 deletions examples/robust/plot_robust_cv_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
"""
================================================================
An example of a robust cross-validation evaluation in regression
================================================================
In this example we compare `LinearRegression` (OLS) with `HuberRegressor` from
scikit-learn using cross-validation.

We show that a robust cross-validation scheme gives a better
evaluation of the generalisation error in a corrupted dataset.
"""
print(__doc__)

import numpy as np
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn_extra.robust import make_huber_metric
from sklearn.linear_model import LinearRegression, HuberRegressor

# Huber-based counterpart of the mean squared error, used below as a
# cross-validation score.
robust_mse = make_huber_metric(mean_squared_error, c=9)
rng = np.random.RandomState(42)

# Noise-free linear data: X is drawn uniformly, so y = 3 * X never exceeds 3.
X = rng.uniform(size=100).reshape(-1, 1)
y = 3 * X.ravel()

# Overwrite three targets with a value far outside the range of the clean data.
outlier_idx = [21, 42, 84]
y[outlier_idx] = 200

# Each section cross-validates both regressors with one metric: first the
# plain MSE, then its Huber-based version.
sections = [
    ("Non robust error:", mean_squared_error),
    ("Robust error:", robust_mse),
]

for position, (title, metric) in enumerate(sections):
    if position > 0:
        # Blank separator between the two reports.
        print("\n")
    print(title)
    scorer = make_scorer(metric)
    for reg in [LinearRegression(), HuberRegressor()]:
        cv_scores = cross_val_score(reg, X, y, scoring=scorer)
        print(reg, " mse : %.2F" % np.mean(cv_scores))
3 changes: 3 additions & 0 deletions sklearn_extra/robust/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
RobustWeightedKMeans,
RobustWeightedRegressor,
)
from sklearn_extra.robust.mean_estimators import huber, make_huber_metric

# Names exported by ``from sklearn_extra.robust import *``: the three robust
# weighted estimators plus the Huber helpers from ``mean_estimators``.
__all__ = [
"RobustWeightedClassifier",
"RobustWeightedKMeans",
"RobustWeightedRegressor",
"huber",
"make_huber_metric",
]
60 changes: 60 additions & 0 deletions sklearn_extra/robust/mean_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
# License: BSD 3 clause

import numpy as np
from scipy.stats import iqr
from sklearn.metrics import mean_squared_error


def block_mom(X, k, random_state):
Expand Down Expand Up @@ -136,3 +138,61 @@ def psisx(x, c):
# new weights.
mu = np.sum(np.array(w[ind_pos]) * x[ind_pos]) / np.sum(w[ind_pos])
return mu


def make_huber_metric(score_func=mean_squared_error, c=None, T=20):
    """
    Build a robust version of a metric using the Huber estimator.

    The returned callable evaluates ``score_func`` observation by observation
    and aggregates the individual values with ``huber`` instead of a plain
    average.

    Parameters
    ----------

    score_func : callable, default=mean_squared_error
        Score function (or loss function). It is called as
        ``score_func(y, y_pred, multioutput="raw_values")`` and must
        therefore accept the ``multioutput`` keyword.

    c : float >= 0 or None, default=None
        Parameter that controls the robustness of the estimator.
        c going to zero gives a behavior close to the median
        (for ``c == 0`` the median of the individual values is returned).
        c going to infinity gives a behavior close to sample mean.
        If c is None, the iqr of the individual values is used as heuristic.

    T : int, default=20
        Number of iterations of the algorithm.

    Returns
    -------

    metric : callable
        Robust metric function with signature ``metric(y_true, y_pred)``.

    Examples
    --------

    >>> import numpy as np
    >>> from sklearn.metrics import mean_squared_error
    >>> from sklearn_extra.robust import make_huber_metric
    >>> robust_mse = make_huber_metric(mean_squared_error, c=5)
    >>> y_true = np.hstack([np.zeros(98), 20*np.ones(2)]) # corrupted test values
    >>> np.random.shuffle(y_true) # shuffle them
    >>> y_pred = np.zeros(100) # predicted values
    >>> robust_mse(y_true, y_pred)
    0.1020408163265306
    """

    def metric(y_true, y_pred):
        # Wrapping each vector in a list hands the data to score_func as a
        # single sample with one output per observation; the intent is that
        # ``multioutput="raw_values"`` then yields the individual values
        # rather than one aggregated number.
        per_sample = score_func(
            [y_true], [y_pred], multioutput="raw_values"
        )
        scale = iqr(per_sample) if c is None else c
        if scale == 0:
            # Degenerate scale: fall back to the median.
            return np.median(per_sample)
        return huber(per_sample, scale, T)

    return metric
16 changes: 15 additions & 1 deletion sklearn_extra/robust/tests/test_mean_estimators.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import numpy as np
import pytest

from sklearn_extra.robust.mean_estimators import median_of_means, huber
from sklearn_extra.robust.mean_estimators import (
median_of_means,
huber,
make_huber_metric,
)
from sklearn.metrics import mean_squared_error


rng = np.random.RandomState(42)
Expand Down Expand Up @@ -29,3 +34,12 @@ def test_huber():
with pytest.warns(None) as record:
huber(X)
assert len(record) == 0


def test_robust_metric():
    """Check that the Huber-based MSE stays small when a few targets are corrupted."""
    robust_mse = make_huber_metric(mean_squared_error, c=5)
    # 95 clean targets and 5 corrupted ones; the prediction is perfect on the
    # clean part only.
    y_true = np.hstack([np.zeros(95), 20 * np.ones(5)])
    # Use a seeded local generator so the test is reproducible and does not
    # touch the global NumPy random state.
    np.random.RandomState(42).shuffle(y_true)
    y_pred = np.zeros(100)

    robust_score = robust_mse(y_true, y_pred)
    assert robust_score < 1
    # The plain MSE is inflated by the corrupted targets, so the robust score
    # must sit below it.
    assert robust_score < mean_squared_error(y_true, y_pred)