diff --git a/sklearn/ensemble/boosting.py b/sklearn/ensemble/boosting.py
new file mode 100644
index 0000000000000..ff0b2ddd53e01
--- /dev/null
+++ b/sklearn/ensemble/boosting.py
@@ -0,0 +1,200 @@
+"""
+Algorithms for Boosting:
+- Functional Gradient Descent
+"""
+
+# Authors: James Bergstra
+# License: BSD3
+
+import numpy as np
+
+from ..utils import safe_asarray
+from ..ensemble import BaseEnsemble
+from ..base import RegressorMixin
+
+
+class LossFunction(object):
+    """Abstract base class for various loss functions."""
+
+    def init_estimator(self, X, y):
+        pass
+
+    def __call__(self, y, pred):
+        pass
+
+    def negative_gradient(self, y, pred):
+        """Compute the negative gradient."""
+        pass
+
+
+class LeastSquaresError(LossFunction):
+    """Loss function for least squares (LS) estimation.
+    Terminal regions need not to be updated for least squares. """
+
+    def init_estimator(self):
+        return MeanPredictor()
+
+    def __call__(self, y, pred):
+        return np.mean((y - pred) ** 2.0)
+
+    def negative_gradient(self, y, pred):
+        return y - pred
+
+
+class LeastAbsoluteError(LossFunction):
+    """Loss function for least absolute deviation (LAD) regression. """
+
+    def init_estimator(self):
+        return MedianPredictor()
+
+    def __call__(self, y, pred):
+        return np.abs(y - pred).mean()
+
+    def negative_gradient(self, y, pred):
+        return np.sign(y - pred)
+
+
+class BinomialDeviance(LossFunction):
+    """Binomial deviance loss function for binary classification."""
+
+    def init_estimator(self):
+        return ClassPriorPredictor()
+
+    def __call__(self, y, pred):
+        """Compute the deviance (= negative log-likelihood). """
+        ## return -2.0 * np.sum(y * pred -
+        ##                      np.log(1.0 + np.exp(pred))) / y.shape[0]
+
+        # logaddexp(0, v) == log(1.0 + exp(v))
+        return -2.0 * np.sum(y * pred -
+                             np.logaddexp(0.0, pred)) / y.shape[0]
+
+    def negative_gradient(self, y, pred):
+        return y - 1.0 / (1.0 + np.exp(-pred))
+
+
+class FunctionalGradient(object):
+    def __init__(self, loss, X, y):
+        self.loss = loss
+        self.X = X
+        self.y = y
+        self.residual = np.array(y)  # copies, ensures array
+
+    def current_Xy(self):
+        return self.X, self.residual
+
+    def update(self, prediction):
+        self.residual = self.loss.negative_gradient(self.residual, prediction)
+
+
+class FitNIter(object):
+    """
+    Iterations (self.next()) implement one round of functional gradient
+    boosting.
+
+
+    """
+    def __init__(self, ensemble, fg, n_iters):
+        self.ensemble = ensemble
+        self.fg = fg
+        self.n_iters = n_iters
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        if self.n_iters == len(self.ensemble.estimators_):
+            raise StopIteration
+        base = self.ensemble._make_estimator()
+        X, y = self.fg.current_Xy()
+        base.fit(X, y)
+        self.fg.update(base.predict(X))
+        return self
+
+
+class GradientBoostedRegressor(BaseEnsemble, RegressorMixin):
+    """
+    Regression Boosting via functional gradient descent.
+
+    The algorithm is to construct a regression ensemble by using a "base
+    estimator" to repeatedly fit residual training error. So for example, the
+    first iteration fits some function f() to the original (X, y) training
+    data, and the second iteration fits some g() to (X, y - f(X)), the third
+    iteration fits some h() to (X, y - f(X) - g(X)), and so on. The final
+    ensemble is f() + g() + h() + ...
+
+    This procedure is equivalent to functional gradient descent when the
+    training objective is to minimize mean squared error (MSE).
+
+    For more information see e.g.:
+        J. H. Friedman (2002). "Stochastic Gradient Boosting",
+        Computational Statistics & Data Analysis.
+
+    TODO: Mason has a good paper on the subject as well.
+    """
+
+    def __init__(self, base_estimator, n_estimators,
+                 loss=LeastSquaresError):
+        super(GradientBoostedRegressor, self).__init__(
+            base_estimator=base_estimator,
+            n_estimators=n_estimators)
+        self.loss = loss
+
+    def fit_iter(self, X, y):
+        """Create a fitting iterator for training set X, y.
+
+        See class FitNIter().
+        """
+        if 'int' in str(y.dtype):
+            raise TypeError('Regression of int-valued targets is ambiguous'
+                            '. Please cast to float if you want to train using a '
+                            'regression criterion.')
+        if issubclass(self.loss, LossFunction):
+            loss = self.loss()
+        else:
+            loss = self.loss
+        return FitNIter(
+            ensemble=self,
+            fg=FunctionalGradient(loss, X, y),
+            n_iters=self.n_estimators)
+
+    def fit(self, X, y):
+        """Build a regression ensemble by functional gradient boosting.
+
+        Parameters
+        ----------
+        X : array-like of shape = [n_samples, n_features]
+            The training input samples.
+
+        y : array-like, shape = [n_samples]
+            The target values (integers that correspond to classes in
+            classification, real numbers in regression).
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        for _ in self.fit_iter(X, y):
+            pass
+        return self
+
+    def predict(self, X):
+        """Return the prediction for array-like X.
+
+        Parameters
+        ----------
+        X : array-like of shape = [n_samples, n_features]
+            Test samples.
+
+        Returns
+        -------
+        prediction : numpy array of shape = [n_samples]
+            Test predictions.
+
+        """
+        rval = self.estimators_[0].predict(X)
+        for estimator in self.estimators_[1:]:
+            pred_i = estimator.predict(X)
+            rval += pred_i
+        return rval
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index 63e6e7469989b..6492951417847 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -23,6 +23,11 @@
 from ..tree._tree import MSE
 from ..tree._tree import DTYPE
 
+from .boosting import LossFunction
+from .boosting import LeastSquaresError
+from .boosting import LeastAbsoluteError
+from .boosting import BinomialDeviance
+
 # ignore overflows due to exp(-pred) in BinomailDeviance
 np.seterr(invalid='raise', under='raise', divide='raise', over='ignore')
 
@@ -89,20 +94,7 @@ def predict(self, X):
         y.fill(self.prior)
         return y
 
-
-class LossFunction(object):
-    """Abstract base class for various loss functions."""
-
-    def init_estimator(self, X, y):
-        pass
-
-    def __call__(self, y, pred):
-        pass
-
-    def negative_gradient(self, y, pred):
-        """Compute the negative gradient."""
-        pass
-
+class TreeLossMixin(object):
     def update_terminal_regions(self, tree, X, y, residual, y_pred,
                                 learn_rate=1.0):
         """Update the terminal regions (=leaves) of the given tree and
@@ -125,32 +117,14 @@ def _update_terminal_region(self, tree, leaf, terminal_region, X, y,
         pass
 
 
-class LeastSquaresError(LossFunction):
+class TreeLeastSquaresError(LeastSquaresError, TreeLossMixin):
     """Loss function for least squares (LS) estimation.
     Terminal regions need not to be updated for least squares. """
 
-    def init_estimator(self):
-        return MeanPredictor()
-
-    def __call__(self, y, pred):
-        return np.mean((y - pred) ** 2.0)
-
-    def negative_gradient(self, y, pred):
-        return y - pred
-
 
-class LeastAbsoluteError(LossFunction):
+class TreeLeastAbsoluteError(LeastAbsoluteError, TreeLossMixin):
     """Loss function for least absolute deviation (LAD) regression. """
 
-    def init_estimator(self):
-        return MedianPredictor()
-
-    def __call__(self, y, pred):
-        return np.abs(y - pred).mean()
-
-    def negative_gradient(self, y, pred):
-        return np.sign(y - pred)
-
     def _update_terminal_region(self, tree, leaf, terminal_region, X, y,
                                 residual, pred):
         """LAD updates terminal regions to median estimates. """
@@ -158,24 +132,9 @@ def _update_terminal_region(self, tree, leaf, terminal_region, X, y,
             pred.take(terminal_region, axis=0))
 
 
-class BinomialDeviance(LossFunction):
+class TreeBinomialDeviance(BinomialDeviance, TreeLossMixin):
     """Binomial deviance loss function for binary classification."""
 
-    def init_estimator(self):
-        return ClassPriorPredictor()
-
-    def __call__(self, y, pred):
-        """Compute the deviance (= negative log-likelihood). """
-        ## return -2.0 * np.sum(y * pred -
-        ##                      np.log(1.0 + np.exp(pred))) / y.shape[0]
-
-        # logaddexp(0, v) == log(1.0 + exp(v))
-        return -2.0 * np.sum(y * pred -
-                             np.logaddexp(0.0, pred)) / y.shape[0]
-
-    def negative_gradient(self, y, pred):
-        return y - 1.0 / (1.0 + np.exp(-pred))
-
     def _update_terminal_region(self, tree, leaf, terminal_region, X, y,
                                 residual, pred):
         """Make a single Newton-Raphson step. """
@@ -191,9 +150,9 @@ def _update_terminal_region(self, tree, leaf, terminal_region, X, y,
         tree.value[leaf, 0] = numerator / denominator
 
 
-LOSS_FUNCTIONS = {'ls': LeastSquaresError,
-                  'lad': LeastAbsoluteError,
-                  'deviance': BinomialDeviance}
+LOSS_FUNCTIONS = {'ls': TreeLeastSquaresError,
+                  'lad': TreeLeastAbsoluteError,
+                  'deviance': TreeBinomialDeviance}
 
 
 class BaseGradientBoosting(BaseEstimator):
diff --git a/sklearn/ensemble/tests/test_boosting.py b/sklearn/ensemble/tests/test_boosting.py
new file mode 100644
index 0000000000000..d4e98c62e1432
--- /dev/null
+++ b/sklearn/ensemble/tests/test_boosting.py
@@ -0,0 +1,50 @@
+from unittest import TestCase
+import numpy as np
+from sklearn.ensemble.boosting import GradientBoostedRegressor
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.datasets.base import load_boston
+
+class TestGradientBoostedRegressor(TestCase):
+    def setUp(self):
+        self.task = load_boston()
+        self.base_est = DecisionTreeRegressor(max_depth=2, min_split=4)
+        self.boosting = GradientBoostedRegressor(
+            base_estimator=DecisionTreeRegressor(
+                max_depth=2,
+                min_split=4),
+            n_estimators=5)
+
+    def test_fit_returns_self(self):
+        r = self.boosting.fit(self.task['data'], self.task['target'])
+        assert r is self.boosting
+
+    def test_1_estimator_matches_base(self):
+        self.boosting = GradientBoostedRegressor(
+            base_estimator=DecisionTreeRegressor(
+                max_depth=2,
+                min_split=4),
+            n_estimators=1)
+        self.base_est.fit(self.task['data'], self.task['target'])
+        self.boosting.fit(self.task['data'], self.task['target'])
+        pred1 = self.base_est.predict(self.task['data'])
+        pred2 = self.boosting.predict(self.task['data'])
+        self.assert_(np.allclose(pred1, pred2))
+
+    def test_n_estimators(self):
+        assert len(self.boosting.estimators_) == 0
+        self.boosting.fit(self.task['data'], self.task['target'])
+        assert len(self.boosting.estimators_) == self.boosting.n_estimators
+
+    def test_int_y_not_implemented(self):
+        self.assertRaises(TypeError,
+                          self.boosting.fit,
+                          np.ones((4, 5)), np.arange(4).astype('int'))
+
+    def test_mse_always_goes_down(self):
+        model = self.boosting
+        task = self.task
+        mse_list = []
+        for fit_iter in model.fit_iter(task['data'], task['target']):
+            mse_list.append(np.mean(fit_iter.fg.residual ** 2))
+            if len(mse_list) > 1:
+                self.assert_(mse_list[-1] < mse_list[-2])