From ce0f064006638b6b1d5f9f328a08f79600f3fd29 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 31 Jan 2012 16:04:13 -0500 Subject: [PATCH 01/18] wip --- asgd/naive_asgd.py | 72 +++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 0396344..1eda265 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -9,26 +9,33 @@ DEFAULT_SGD_STEP_SIZE0 = 1e-2 DEFAULT_L2_REGULARIZATION = 1e-3 -DEFAULT_N_ITERATIONS = 10 +DEFAULT_MIN_OBSERVATIONS = 1000 +DEFAULT_MAX_OBSERVATIONS = None DEFAULT_FEEDBACK = False DEFAULT_RSTATE = None DEFAULT_DTYPE = np.float32 +DEFAULT_N_ITERATIONS = 10 # -- used for OVA + class BaseASGD(object): def __init__(self, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): + min_observations=DEFAULT_MIN_OBSERVATIONS, + max_observations=DEFAULT_MAX_OBSERVATIONS, + feedback=DEFAULT_FEEDBACK, + rstate=DEFAULT_RSTATE, + dtype=DEFAULT_DTYPE): # -- assert n_features > 1 self.n_features = n_features - assert n_iterations > 0 - self.n_iterations = n_iterations + assert 0 <= min_observations <= max_observations + self.min_observations = min_observations + self.max_observations = max_observations if feedback: raise NotImplementedError("FIXME: feedback support is buggy") @@ -51,33 +58,40 @@ def __init__(self, n_features, self.asgd_step_size0 = 1 self.asgd_step_size = self.asgd_step_size0 + # -- self.n_observations = 0 + self.train_means = [] + self.recent_train_costs = [] + def fit_converged(self): + train_means = self.train_means + if len(train_means) > 1: + midpt = len(train_means) // 2 + if train_means[-1] > .99 * train_means[midpt]: + return True + return False -class NaiveBinaryASGD(BaseASGD): - def __init__(self, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, - l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): +class BaseBinaryASGD(BaseASGD): - super(NaiveBinaryASGD, self).__init__( - n_features, - sgd_step_size0=sgd_step_size0, - l2_regularization=l2_regularization, - n_iterations=n_iterations, - feedback=feedback, - rstate=rstate, - dtype=dtype, - ) + def __init__(self, *args, **kwargs): + BaseASGD.__init__(self, *args, **kwargs) - # -- self.sgd_weights = np.zeros((n_features), dtype=dtype) self.sgd_bias = np.zeros((1), dtype=dtype) self.asgd_weights = np.zeros((n_features), dtype=dtype) self.asgd_bias = np.zeros((1), dtype=dtype) + + def decision_function(self, X): + return dot(self.asgd_weights, X.T) + self.asgd_bias + + def predict(self, X): + return np.sign(self.decision_function(X)) + +class NaiveBinaryASGD(BaseBinaryASGD): + def partial_fit(self, X, y): sgd_step_size0 = self.sgd_step_size0 @@ -96,6 +110,9 @@ def partial_fit(self, X, y): l2_regularization = self.l2_regularization n_observations = self.n_observations + train_means = self.train_means + recent_train_costs = self.recent_train_costs + min_observations = self.min_observations for obs, label in izip(X, y): @@ -110,6 +127,9 @@ def partial_fit(self, X, y): sgd_weights += sgd_step_size * label * obs sgd_bias += sgd_step_size * label + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0) # -- update asgd asgd_weights = (1 - asgd_step_size) * asgd_weights \ @@ -126,6 +146,12 @@ def partial_fit(self, X, y): 
sgd_step_size_scheduling_exponent) asgd_step_size = 1. / n_observations + if len(recent_train_costs) == min_observations: + train_means.append(np.mean(recent_train_costs) + + l2_regularization * np.dot( + self.asgd_weights, self.asgd_weights)) + recent_train_costs = [] + # -- self.sgd_weights = sgd_weights self.sgd_bias = sgd_bias @@ -163,12 +189,6 @@ def fit(self, X, y): return self - def decision_function(self, X): - return dot(self.asgd_weights, X.T) + self.asgd_bias - - def predict(self, X): - return np.sign(self.decision_function(X)) - class NaiveOVAASGD(BaseASGD): From dcb1a11eb5ef2061c24670ef1d49b2d1fa79f82d Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 31 Jan 2012 17:59:12 -0500 Subject: [PATCH 02/18] writing early stopping better and with tests --- asgd/naive_asgd.py | 81 ++++++++++++++++++++++--------- asgd/tests/test_auto_step_size.py | 18 ++++--- asgd/tests/test_naive_asgd.py | 51 ++++++++++++++++--- 3 files changed, 110 insertions(+), 40 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 1eda265..8a3bec2 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -3,6 +3,7 @@ naive, non-optimized implementation """ +import sys import numpy as np from numpy import dot from itertools import izip @@ -10,12 +11,12 @@ DEFAULT_SGD_STEP_SIZE0 = 1e-2 DEFAULT_L2_REGULARIZATION = 1e-3 DEFAULT_MIN_OBSERVATIONS = 1000 -DEFAULT_MAX_OBSERVATIONS = None +DEFAULT_MAX_OBSERVATIONS = sys.maxint DEFAULT_FEEDBACK = False DEFAULT_RSTATE = None DEFAULT_DTYPE = np.float32 - -DEFAULT_N_ITERATIONS = 10 # -- used for OVA +DEFAULT_N_PARTIAL = 1000 +DEFAULT_FIT_TOLERANCE = 1e-2 class BaseASGD(object): @@ -25,6 +26,8 @@ def __init__(self, n_features, l2_regularization=DEFAULT_L2_REGULARIZATION, min_observations=DEFAULT_MIN_OBSERVATIONS, max_observations=DEFAULT_MAX_OBSERVATIONS, + fit_n_partial=DEFAULT_N_PARTIAL, + fit_tolerance=DEFAULT_FIT_TOLERANCE, feedback=DEFAULT_FEEDBACK, rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): @@ -33,10 +36,15 @@ def __init__(self, n_features, assert n_features > 1 self.n_features = n_features - assert 0 <= min_observations <= max_observations + if not 0 <= min_observations <= max_observations: + raise ValueError('min_observations > max_observations', + (min_observations, max_observations)) self.min_observations = min_observations self.max_observations = max_observations + self.fit_n_partial = fit_n_partial + self.fit_tolerance = fit_tolerance + if feedback: raise NotImplementedError("FIXME: feedback support is buggy") self.feedback = feedback @@ -65,10 +73,10 @@ def __init__(self, n_features, def fit_converged(self): train_means = self.train_means - if len(train_means) > 1: + if len(train_means) > 2: midpt = len(train_means) // 2 - if train_means[-1] > .99 * train_means[midpt]: - return True + thresh = (1 - self.fit_tolerance) * train_means[midpt] + return train_means[-1] > thresh return False @@ -77,19 +85,20 @@ class BaseBinaryASGD(BaseASGD): def __init__(self, *args, **kwargs): BaseASGD.__init__(self, *args, **kwargs) - self.sgd_weights = np.zeros((n_features), dtype=dtype) - self.sgd_bias = np.zeros((1), dtype=dtype) - - self.asgd_weights = np.zeros((n_features), dtype=dtype) - self.asgd_bias = np.zeros((1), dtype=dtype) + self.sgd_weights = np.zeros((self.n_features,), dtype=self.dtype) + self.sgd_bias = np.asarray(0, dtype=self.dtype) + self.asgd_weights = np.zeros((self.n_features,), dtype=self.dtype) + self.asgd_bias = np.asarray(0, dtype=self.dtype) def decision_function(self, X): + X = np.asarray(X) return dot(self.asgd_weights, X.T) 
+ self.asgd_bias def predict(self, X): return np.sign(self.decision_function(X)) + class NaiveBinaryASGD(BaseBinaryASGD): def partial_fit(self, X, y): @@ -146,11 +155,11 @@ def partial_fit(self, X, y): sgd_step_size_scheduling_exponent) asgd_step_size = 1. / n_observations - if len(recent_train_costs) == min_observations: + if len(recent_train_costs) == self.fit_n_partial: train_means.append(np.mean(recent_train_costs) + l2_regularization * np.dot( self.asgd_weights, self.asgd_weights)) - recent_train_costs = [] + self.recent_train_costs = recent_train_costs = [] # -- self.sgd_weights = sgd_weights @@ -174,19 +183,31 @@ def fit(self, X, y): assert n_features == self.n_features assert n_points == y.size - n_iterations = self.n_iterations + n_points_remaining = self.max_observations - self.n_observations + + while n_points_remaining > 0: - for i in xrange(n_iterations): + # -- every iteration will train from n_partial observations and + # then check for convergence + fit_n_partial = min(n_points_remaining, self.fit_n_partial) idx = self.rstate.permutation(n_points) - Xb = X[idx] - yb = y[idx] + Xb = X[idx[:fit_n_partial]] + yb = y[idx[:fit_n_partial]] self.partial_fit(Xb, yb) if self.feedback: + raise NotImplementedError( + 'partial_fit logic requires memory to be distinct') self.sgd_weights = self.asgd_weights self.sgd_bias = self.asgd_bias + if (self.n_observations >= self.min_observations + and self.fit_converged()): + break + + n_points_remaining -= len(Xb) + return self @@ -195,7 +216,10 @@ class NaiveOVAASGD(BaseASGD): def __init__(self, n_classes, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, + min_observations=DEFAULT_MIN_OBSERVATIONS, + max_observations=DEFAULT_MAX_OBSERVATIONS, + fit_n_partial=DEFAULT_N_PARTIAL, + fit_tolerance=DEFAULT_FIT_TOLERANCE, feedback=DEFAULT_FEEDBACK, rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): @@ -204,7 +228,10 @@ def __init__(self, n_classes, n_features, n_features, sgd_step_size0=sgd_step_size0, l2_regularization=l2_regularization, - n_iterations=n_iterations, + min_observations=min_observations, + max_observations=max_observations, + fit_n_partial=fit_n_partial, + fit_tolerance=fit_tolerance, feedback=feedback, rstate=rstate, dtype=dtype, @@ -299,19 +326,25 @@ def fit(self, X, y): assert n_features == self.n_features assert n_points == y.size - n_iterations = self.n_iterations - for i in xrange(n_iterations): + n_points_remaining = self.max_observations - self.n_observations + + while n_points_remaining > 0: + # -- every iteration will train from n_partial observations and + # then check for convergence + fit_n_partial = min(n_points_remaining, self.fit_n_partial) idx = self.rstate.permutation(n_points) - Xb = X[idx] - yb = y[idx] + Xb = X[idx[:fit_n_partial]] + yb = y[idx[:fit_n_partial]] self.partial_fit(Xb, yb) if self.feedback: self.sgd_weights = self.asgd_weights self.sgd_bias = self.asgd_bias + n_points_remaining -= len(Xb) + return self def decision_function(self, X): diff --git a/asgd/tests/test_auto_step_size.py b/asgd/tests/test_auto_step_size.py index 89799e1..5090837 100644 --- a/asgd/tests/test_auto_step_size.py +++ b/asgd/tests/test_auto_step_size.py @@ -11,11 +11,13 @@ from test_naive_asgd import get_fake_data -def get_new_model(n_features, rstate): +def get_new_model(n_features, rstate, n_points): return BinaryASGD(n_features, rstate=rstate, sgd_step_size0=1e3, l2_regularization=1e-3, - n_iterations=5) + max_observations=5 * n_points, + 
min_observations=5 * n_points, + ) def test_binary_sgd_step_size0(): @@ -24,7 +26,7 @@ def test_binary_sgd_step_size0(): X, y = get_fake_data(100, n_features, rstate) - clf = get_new_model(n_features, rstate) + clf = get_new_model(n_features, rstate, 100) best = find_sgd_step_size0(clf, X, y, (.25, .5)) assert_almost_equal(best, -4.9927, decimal=4) @@ -40,21 +42,21 @@ def test_binary_fit(): rstate = RandomState(42) n_features = 20 - clf100 = get_new_model(n_features, rstate) + clf100 = get_new_model(n_features, rstate, 100) X, y = get_fake_data(100, n_features, rstate) _clf100 = binary_fit(clf100, X, y) assert _clf100 is clf100 assert_almost_equal(clf100.sgd_step_size0, 0.04812, decimal=4) # smoke test - clf1000 = get_new_model(n_features, rstate) + clf1000 = get_new_model(n_features, rstate, 1000) X, y = get_fake_data(DEFAULT_MAX_EXAMPLES, n_features, rstate) _clf1000 = binary_fit(clf1000, X, y) assert _clf1000 is clf1000 assert_almost_equal(clf1000.sgd_step_size0, 0.0047, decimal=4) # smoke test that at least it runs - clf2000 = get_new_model(n_features, rstate) + clf2000 = get_new_model(n_features, rstate, 2000) X, y = get_fake_data(2000, n_features, rstate) _clf2000 = binary_fit(clf2000, X, y) assert _clf2000 == clf2000 @@ -67,10 +69,10 @@ def test_fit_replicable(): X, y = get_fake_data(100, n_features, RandomState(4)) - m0 = get_new_model(n_features, RandomState(45)) + m0 = get_new_model(n_features, RandomState(45), 100) m0 = binary_fit(m0, X, y) - m1 = get_new_model(n_features, RandomState(45)) + m1 = get_new_model(n_features, RandomState(45), 100) m1 = binary_fit(m1, X, y) assert_array_equal(m0.sgd_weights, m1.sgd_weights) diff --git a/asgd/tests/test_naive_asgd.py b/asgd/tests/test_naive_asgd.py index 0900f87..668c073 100644 --- a/asgd/tests/test_naive_asgd.py +++ b/asgd/tests/test_naive_asgd.py @@ -1,3 +1,4 @@ +import sys from nose.tools import assert_equal, raises from nose.plugins.skip import SkipTest from numpy.testing import assert_allclose @@ -16,7 +17,8 @@ DEFAULT_ARGS = (N_FEATURES,) DEFAULT_KWARGS = dict(sgd_step_size0=1e-3, l2_regularization=1e-6, - n_iterations=4, + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, dtype=np.float32) @@ -41,7 +43,6 @@ def get_fake_multiclass_data(n_points, n_features, n_classes, rstate): def test_naive_asgd(): - rstate = RandomState(42) X, y = get_fake_data(N_POINTS, N_FEATURES, rstate) @@ -50,6 +51,7 @@ def test_naive_asgd(): clf = BinaryASGD(*DEFAULT_ARGS, rstate=rstate, **DEFAULT_KWARGS) clf.fit(X, y) + assert clf.n_observations == clf.min_observations == clf.max_observations ytrn_preds = clf.predict(X) ytst_preds = clf.predict(Xtst) ytrn_acc = (ytrn_preds == y).mean() @@ -78,7 +80,6 @@ def test_naive_asgd_with_feedback(): def test_naive_asgd_multi_labels(): - rstate = RandomState(44) X, y = get_fake_binary_data_multi_labels(N_POINTS, N_FEATURES, rstate) @@ -87,7 +88,10 @@ def test_naive_asgd_multi_labels(): # n_classes is 2 since it is actually a binary case clf = OVAASGD(2, N_FEATURES, sgd_step_size0=1e-3, l2_regularization=1e-6, - n_iterations=4, dtype=np.float32, rstate=rstate) + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, + dtype=np.float32, + rstate=rstate) clf.fit(X, y) ytrn_preds = clf.predict(X) ytst_preds = clf.predict(Xtst) @@ -98,7 +102,6 @@ def test_naive_asgd_multi_labels(): def test_naive_multiclass_ova_asgd(): - rstate = RandomState(45) n_classes = 10 @@ -108,7 +111,10 @@ def test_naive_multiclass_ova_asgd(): rstate) clf = OVAASGD(n_classes, N_FEATURES, sgd_step_size0=1e-3, - 
l2_regularization=1e-6, n_iterations=4, dtype=np.float32, + l2_regularization=1e-6, + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, + dtype=np.float32, rstate=rstate) clf.fit(X, y) @@ -123,7 +129,6 @@ def test_naive_multiclass_ova_asgd(): def test_naive_multiclass_ova_vs_binary_asgd(): - rstate = RandomState(42) n_classes = 3 @@ -162,7 +167,6 @@ def test_naive_multiclass_ova_vs_binary_asgd(): @raises(ValueError) def test_naive_ova_asgd_wrong_labels(): - rstate = RandomState(42) n_classes = 10 @@ -174,3 +178,34 @@ def test_naive_ova_asgd_wrong_labels(): rstate=RandomState(999), **DEFAULT_KWARGS) ytrn_bad = rstate.randint(n_classes + 42, size=len(ytrn)) clf.partial_fit(Xtrn, ytrn_bad) + + +def test_early_stopping(): + rstate = RandomState(42) + + for N_POINTS in (1000, 2000): + + X, y = get_fake_data(N_POINTS, N_FEATURES, rstate) + Xtst, ytst = get_fake_data(N_POINTS, N_FEATURES, rstate) + + kwargs = dict(DEFAULT_KWARGS) + kwargs['fit_n_partial'] = 1500 + del kwargs['max_observations'] + print kwargs + print 'N_POINTS', N_POINTS + + clf = BinaryASGD(*DEFAULT_ARGS, rstate=rstate, **kwargs) + + clf.fit(X, y) + print clf.fit_n_partial + print len(clf.train_means) + print clf.n_observations + assert clf.fit_n_partial == kwargs['fit_n_partial'] + assert len(clf.train_means) == clf.n_observations / clf.fit_n_partial + assert clf.min_observations == kwargs['min_observations'] + assert clf.max_observations == sys.maxint + assert clf.n_observations >= kwargs['min_observations'] + assert clf.fit_converged() + ytrn_preds = clf.predict(X) + ytrn_acc = (ytrn_preds == y).mean() + assert ytrn_acc > 0.7 From 2e90505b1170d2ce050122480f01f445128222cf Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 31 Jan 2012 18:55:24 -0500 Subject: [PATCH 03/18] better docstring for binary_fit --- asgd/auto_step_size.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index 3045520..3ab1209 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -85,7 +85,8 @@ def binary_fit( max_examples: int Maximum number of examples to use from `X` and `y` to find an - estimate of the best sgd_step_size0 + estimate of the best sgd_step_size0. N.B. That the entirety of X and y + is used for the final fit() call after the best step size has been found. 
Returns ------- From 0db2514f268bb4ddc2392285849cd8dd79c91860 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 31 Jan 2012 18:55:42 -0500 Subject: [PATCH 04/18] NaiveBinaryASGD assert labels in +-1 --- asgd/naive_asgd.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 8a3bec2..36fa1a0 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -103,6 +103,8 @@ class NaiveBinaryASGD(BaseBinaryASGD): def partial_fit(self, X, y): + assert np.all(y**2 == 1) + sgd_step_size0 = self.sgd_step_size0 sgd_step_size = self.sgd_step_size sgd_step_size_scheduling_exponent = \ From 2142727ae708126f72931293a12e69198408a53c Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Fri, 3 Feb 2012 20:42:08 -0500 Subject: [PATCH 05/18] early stopping tolerance in terms of abs and rel tolerance --- asgd/naive_asgd.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 36fa1a0..df227dc 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -15,8 +15,9 @@ DEFAULT_FEEDBACK = False DEFAULT_RSTATE = None DEFAULT_DTYPE = np.float32 -DEFAULT_N_PARTIAL = 1000 -DEFAULT_FIT_TOLERANCE = 1e-2 +DEFAULT_N_PARTIAL = 5000 +DEFAULT_FIT_ABS_TOLERANCE = 1e-4 +DEFAULT_FIT_REL_TOLERANCE = 1e-2 class BaseASGD(object): @@ -27,7 +28,8 @@ def __init__(self, n_features, min_observations=DEFAULT_MIN_OBSERVATIONS, max_observations=DEFAULT_MAX_OBSERVATIONS, fit_n_partial=DEFAULT_N_PARTIAL, - fit_tolerance=DEFAULT_FIT_TOLERANCE, + fit_abs_tolerance=DEFAULT_FIT_ABS_TOLERANCE, + fit_rel_tolerance=DEFAULT_FIT_REL_TOLERANCE, feedback=DEFAULT_FEEDBACK, rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): @@ -43,7 +45,8 @@ def __init__(self, n_features, self.max_observations = max_observations self.fit_n_partial = fit_n_partial - self.fit_tolerance = fit_tolerance + self.fit_abs_tolerance = fit_abs_tolerance + self.fit_rel_tolerance = fit_rel_tolerance if feedback: raise NotImplementedError("FIXME: feedback support is buggy") @@ -73,9 +76,11 @@ def __init__(self, n_features, def fit_converged(self): train_means = self.train_means + rel_tol = self.fit_rel_tolerance + abs_tol = self.fit_abs_tolerance if len(train_means) > 2: midpt = len(train_means) // 2 - thresh = (1 - self.fit_tolerance) * train_means[midpt] + thresh = (1 - rel_tol) * train_means[midpt] - abs_tol return train_means[-1] > thresh return False @@ -221,7 +226,8 @@ def __init__(self, n_classes, n_features, min_observations=DEFAULT_MIN_OBSERVATIONS, max_observations=DEFAULT_MAX_OBSERVATIONS, fit_n_partial=DEFAULT_N_PARTIAL, - fit_tolerance=DEFAULT_FIT_TOLERANCE, + fit_abs_tolerance=DEFAULT_FIT_ABS_TOLERANCE, + fit_rel_tolerance=DEFAULT_FIT_REL_TOLERANCE, feedback=DEFAULT_FEEDBACK, rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): @@ -233,7 +239,8 @@ def __init__(self, n_classes, n_features, min_observations=min_observations, max_observations=max_observations, fit_n_partial=fit_n_partial, - fit_tolerance=fit_tolerance, + fit_abs_tolerance=fit_abs_tolerance, + fit_rel_tolerance=fit_rel_tolerance, feedback=feedback, rstate=rstate, dtype=dtype, From 4de0d6c8520f1ad4a2d291c499a96a0541267463 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Fri, 3 Feb 2012 20:42:35 -0500 Subject: [PATCH 06/18] improvements to binary_fit --- asgd/auto_step_size.py | 49 +++++++++++++++---------------- asgd/tests/test_auto_step_size.py | 25 ++++++++-------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index 
3ab1209..ce082b1 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -3,13 +3,14 @@ from scipy import optimize DEFAULT_INITIAL_RANGE = 0.25, 0.5 -DEFAULT_MAX_EXAMPLES = 1000 -DEFAULT_TOLERANCE = 0.5 +DEFAULT_MAX_EXAMPLES = 1000 # estimate stepsize from this many examples +DEFAULT_TOLERANCE = 0.01 # in logarithmic units of the training criterion DEFAULT_BRENT_OUTPUT = False + def find_sgd_step_size0( - model, X, y, + model, partial_fit_args, initial_range=DEFAULT_INITIAL_RANGE, tolerance=DEFAULT_TOLERANCE, brent_output=DEFAULT_BRENT_OUTPUT): """Use a Brent line search to find the best step size @@ -19,11 +20,8 @@ def find_sgd_step_size0( model: BinaryASGD Instance of a BinaryASGD - X: array, shape = [n_samples, n_features] - Array of features - - y: array, shape = [n_samples] - Array of labels in (-1, 1) + partial_fit_args - tuple of arguments for model.partial_fit. + This tuple must start with X, y, ... initial_range: tuple of float Initial range for the sgd_step_size0 search (low, high) @@ -48,26 +46,31 @@ def eval_size0(log2_size0): current_step_size = 2 ** log2_size0 other.sgd_step_size0 = current_step_size other.sgd_step_size = current_step_size - other.partial_fit(X, y) + other.partial_fit(*partial_fit_args) # Hack: asgd is lower variance than sgd, but it's tuned to work # well asymptotically, not after just a few examples weights = .5 * (other.asgd_weights + other.sgd_weights) bias = .5 * (other.asgd_bias + other.sgd_bias) + X, y = partial_fit_args[:2] margin = y * (np.dot(X, weights) + bias) l2_cost = other.l2_regularization * (weights ** 2).sum() rval = np.maximum(0, 1 - margin).mean() + l2_cost + if np.isnan(rval): + rval = float('inf') + # -- apply minimizer in log domain + rval = np.log(rval) _cache[log2_size0] = rval return rval - best_sgd_step_size0 = optimize.brent( + log2_best_sgd_step_size0 = optimize.brent( eval_size0, brack=np.log2(initial_range), tol=tolerance) - return best_sgd_step_size0 + return 2 ** log2_best_sgd_step_size0 def binary_fit( - model, X, y, + model, fit_args, max_examples=DEFAULT_MAX_EXAMPLES, **find_sgd_step_size0_kwargs): """Returns a model with automatically-selected sgd_step_size0 @@ -77,11 +80,8 @@ def binary_fit( model: BinaryASGD Instance of the model to be fitted. - X: array, shape = [n_samples, n_features] - Array of features - - y: array, shape = [n_samples] - Array of labels in (-1, 1) + fit_args - tuple of args to model.fit + This method assumes they are all length-of-dataset ndarrays. max_examples: int Maximum number of examples to use from `X` and `y` to find an @@ -95,23 +95,22 @@ def binary_fit( sgd_step_size0 """ - assert X.ndim == 2 - assert len(X) == len(y) assert max_examples > 0 # randomly choose up to max_examples uniformly without replacement from # across the whole set of training data. - idxs = model.rstate.permutation(len(X))[:max_examples] + all_idxs = model.rstate.permutation(len(fit_args[0])) + idxs = all_idxs[:max_examples] # Find the best learning rate for that subset best = find_sgd_step_size0( - model, X[idxs], y[idxs], **find_sgd_step_size0_kwargs) + model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. - best_estimate = 2. 
** (best - 1.0) - model.sgd_step_size0 = best_estimate - model.sgd_step_size = best_estimate - model.fit(X, y) + stepdown = 5 * np.sqrt( float(len(all_idxs)) / float(len(idxs))) + model.sgd_step_size0 = best / stepdown + model.sgd_step_size = best / stepdown + model.fit(*fit_args) return model diff --git a/asgd/tests/test_auto_step_size.py b/asgd/tests/test_auto_step_size.py index 5090837..38e4d1a 100644 --- a/asgd/tests/test_auto_step_size.py +++ b/asgd/tests/test_auto_step_size.py @@ -27,12 +27,13 @@ def test_binary_sgd_step_size0(): X, y = get_fake_data(100, n_features, rstate) clf = get_new_model(n_features, rstate, 100) - best = find_sgd_step_size0(clf, X, y, (.25, .5)) - assert_almost_equal(best, -4.9927, decimal=4) + best0 = find_sgd_step_size0(clf, (X, y), (.25, .5), tolerance=1e-6) # start a little lower, still works - best = find_sgd_step_size0(clf, X, y, (.125, .25)) - assert_almost_equal(best, -4.6180, decimal=4) + best1 = find_sgd_step_size0(clf, (X, y), (.125, .25), tolerance=1e-6) + print best0, best1 + assert_almost_equal(best0, 0.03501, decimal=4) + assert_almost_equal(best1, 0.03501, decimal=4) # find_sgd_step_size0 does not change clf assert clf.sgd_step_size0 == 1000.0 @@ -44,23 +45,23 @@ def test_binary_fit(): clf100 = get_new_model(n_features, rstate, 100) X, y = get_fake_data(100, n_features, rstate) - _clf100 = binary_fit(clf100, X, y) + _clf100 = binary_fit(clf100, (X, y)) assert _clf100 is clf100 - assert_almost_equal(clf100.sgd_step_size0, 0.04812, decimal=4) + assert_almost_equal(clf100.sgd_step_size0, 0.00954, decimal=4) # smoke test clf1000 = get_new_model(n_features, rstate, 1000) X, y = get_fake_data(DEFAULT_MAX_EXAMPLES, n_features, rstate) - _clf1000 = binary_fit(clf1000, X, y) + _clf1000 = binary_fit(clf1000, (X, y)) assert _clf1000 is clf1000 - assert_almost_equal(clf1000.sgd_step_size0, 0.0047, decimal=4) + assert_almost_equal(clf1000.sgd_step_size0, 0.00201, decimal=4) # smoke test that at least it runs clf2000 = get_new_model(n_features, rstate, 2000) X, y = get_fake_data(2000, n_features, rstate) - _clf2000 = binary_fit(clf2000, X, y) + _clf2000 = binary_fit(clf2000, (X, y)) assert _clf2000 == clf2000 - assert_almost_equal(clf2000.sgd_step_size0, 0.0067, decimal=4) + assert_almost_equal(clf2000.sgd_step_size0, 0.001498, decimal=4) def test_fit_replicable(): @@ -70,10 +71,10 @@ def test_fit_replicable(): X, y = get_fake_data(100, n_features, RandomState(4)) m0 = get_new_model(n_features, RandomState(45), 100) - m0 = binary_fit(m0, X, y) + m0 = binary_fit(m0, (X, y)) m1 = get_new_model(n_features, RandomState(45), 100) - m1 = binary_fit(m1, X, y) + m1 = binary_fit(m1, (X, y)) assert_array_equal(m0.sgd_weights, m1.sgd_weights) assert_array_equal(m0.sgd_bias, m1.sgd_bias) From a56823eaa441dd8f4fa6ded60a825397a9dbcb44 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Fri, 24 Feb 2012 00:37:22 -0500 Subject: [PATCH 07/18] adding logger to binary_fit --- asgd/auto_step_size.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index ce082b1..9b24c7b 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -1,7 +1,10 @@ import copy +import logging import numpy as np from scipy import optimize +logger = logging.getLogger(__name__) + DEFAULT_INITIAL_RANGE = 0.25, 0.5 DEFAULT_MAX_EXAMPLES = 1000 # estimate stepsize from this many examples DEFAULT_TOLERANCE = 0.01 # in logarithmic units of the training criterion @@ -105,10 +108,13 @@ def binary_fit( # Find the best learning rate for that subset 
best = find_sgd_step_size0( model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) + logger.info('found best: %f' % best) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. stepdown = 5 * np.sqrt( float(len(all_idxs)) / float(len(idxs))) + + logger.info('setting sgd_step_size: %f' % (best/stepdown)) model.sgd_step_size0 = best / stepdown model.sgd_step_size = best / stepdown model.fit(*fit_args) From dd5d2b98c8954b059c15f828a9d4e545693bfa53 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Fri, 24 Feb 2012 00:37:46 -0500 Subject: [PATCH 08/18] disallow learning rate smaller than 1e-7 --- asgd/auto_step_size.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index 9b24c7b..dd97209 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -69,7 +69,8 @@ def eval_size0(log2_size0): log2_best_sgd_step_size0 = optimize.brent( eval_size0, brack=np.log2(initial_range), tol=tolerance) - return 2 ** log2_best_sgd_step_size0 + rval = max(2 ** log2_best_sgd_step_size0, 1e-7) + return rval def binary_fit( From ddf0560021b9547c63d6a7ee53f5ffb091d80018 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 28 Feb 2012 19:43:58 -0500 Subject: [PATCH 09/18] different line search technique for auto step size --- asgd/auto_step_size.py | 49 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index dd97209..d230c97 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -1,21 +1,18 @@ import copy import logging import numpy as np -from scipy import optimize +import scipy.optimize logger = logging.getLogger(__name__) -DEFAULT_INITIAL_RANGE = 0.25, 0.5 DEFAULT_MAX_EXAMPLES = 1000 # estimate stepsize from this many examples -DEFAULT_TOLERANCE = 0.01 # in logarithmic units of the training criterion -DEFAULT_BRENT_OUTPUT = False - +DEFAULT_TOLERANCE = 1.0 # in log-2 units of the learning rate +DEFAULT_SGD_STEP_SIZE_FLOOR = 1e-7 # -- for huge feature vectors, reduce this. def find_sgd_step_size0( model, partial_fit_args, - initial_range=DEFAULT_INITIAL_RANGE, - tolerance=DEFAULT_TOLERANCE, brent_output=DEFAULT_BRENT_OUTPUT): + tolerance=DEFAULT_TOLERANCE): """Use a Brent line search to find the best step size Parameters @@ -26,15 +23,10 @@ def find_sgd_step_size0( partial_fit_args - tuple of arguments for model.partial_fit. This tuple must start with X, y, ... - initial_range: tuple of float - Initial range for the sgd_step_size0 search (low, high) - - max_iterations: - Maximum number of interations + tolerance: in logarithmic step size units Returns ------- - best_sgd_step_size0: float Optimal sgd_step_size0 given `X` and `y`. """ # -- stupid scipy calls some sizes twice!? 
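
For reference, the calling convention this step-size search now uses (a model plus a tuple of partial_fit arguments) can be exercised as in the following sketch; the toy data, shapes, and values are made up for illustration only:

    import numpy as np
    from asgd import NaiveBinaryASGD
    from asgd.auto_step_size import find_sgd_step_size0, binary_fit

    rstate = np.random.RandomState(0)
    X = rstate.randn(1000, 20).astype('float32')
    y = ((rstate.randn(1000) > 0) * 2 - 1).astype('float32')  # labels in {-1, +1}

    clf = NaiveBinaryASGD(n_features=20, rstate=rstate)

    # the line search takes (X, y, ...) as a tuple; it works on a deep copy
    # and does not modify clf
    best_step0 = find_sgd_step_size0(clf, (X, y))

    # binary_fit runs the same search on a subsample, then fits clf in place
    clf = binary_fit(clf, (X, y))
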
@@ -42,7 +34,7 @@ def find_sgd_step_size0( def eval_size0(log2_size0): try: - return _cache[log2_size0] + return _cache[float(log2_size0)] except KeyError: pass other = copy.deepcopy(model) @@ -58,24 +50,30 @@ def eval_size0(log2_size0): X, y = partial_fit_args[:2] margin = y * (np.dot(X, weights) + bias) l2_cost = other.l2_regularization * (weights ** 2).sum() + #print 'Hinge:', np.maximum(0, 1 - margin).mean() + #print 'L2', l2_cost rval = np.maximum(0, 1 - margin).mean() + l2_cost if np.isnan(rval): rval = float('inf') - # -- apply minimizer in log domain - rval = np.log(rval) - _cache[log2_size0] = rval + #print 'find step %e: %e' % (current_step_size, rval) return rval - log2_best_sgd_step_size0 = optimize.brent( - eval_size0, brack=np.log2(initial_range), tol=tolerance) + log2_best_sgd_step_size0 = scipy.optimize.fmin( + eval_size0, + x0=np.log2(model.sgd_step_size0), + xtol=tolerance, + ) + + #print 'Brent came back with', log2_best_sgd_step_size0, 2 ** log2_best_sgd_step_size0 - rval = max(2 ** log2_best_sgd_step_size0, 1e-7) + rval = 2.0 ** log2_best_sgd_step_size0 return rval def binary_fit( model, fit_args, max_examples=DEFAULT_MAX_EXAMPLES, + step_size_floor=DEFAULT_SGD_STEP_SIZE_FLOOR, **find_sgd_step_size0_kwargs): """Returns a model with automatically-selected sgd_step_size0 @@ -109,15 +107,16 @@ def binary_fit( # Find the best learning rate for that subset best = find_sgd_step_size0( model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) - logger.info('found best: %f' % best) + logger.info('found best: %e' % best) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. - stepdown = 5 * np.sqrt( float(len(all_idxs)) / float(len(idxs))) + stepdown = np.sqrt( float(len(all_idxs)) / float(len(idxs))) + step_size0 = max(best / stepdown, step_size_floor) - logger.info('setting sgd_step_size: %f' % (best/stepdown)) - model.sgd_step_size0 = best / stepdown - model.sgd_step_size = best / stepdown + logger.info('setting sgd_step_size: %e' % step_size0) + model.sgd_step_size0 = float(step_size0) + model.sgd_step_size = float(step_size0) model.fit(*fit_args) return model From 47200288d633036d57bbf418e895c597a0a833c7 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 28 Feb 2012 19:44:39 -0500 Subject: [PATCH 10/18] added perfect-fit convergence condition --- asgd/naive_asgd.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index df227dc..d14a9db 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -75,13 +75,34 @@ def __init__(self, n_features, self.recent_train_costs = [] def fit_converged(self): + """ + There are two convergence tests here. 
Training is considered to be + over if it has + + * stalled: latest train_means is allclose to a previous one (*) + + * terminated: latest train_means is allclose to 0 + + """ train_means = self.train_means - rel_tol = self.fit_rel_tolerance - abs_tol = self.fit_abs_tolerance + rtol = self.fit_rel_tolerance + atol = self.fit_abs_tolerance + assert np.min(train_means) >= 0 + + # -- check for perfect fit + if len(train_means) > 1: + if np.allclose(train_means[-1], 0, atol=atol, rtol=rtol): + return True + + # -- check for stall condition if len(train_means) > 2: - midpt = len(train_means) // 2 - thresh = (1 - rel_tol) * train_means[midpt] - abs_tol - return train_means[-1] > thresh + old_pt = max( + len(train_means) // 2, + len(train_means) - 4) + thresh = (1 - rtol) * train_means[old_pt] - atol + if train_means[-1] > thresh: + return True + return False From 73db426c54d6767591c2c652a1f3f6b18e2f77bc Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 28 Feb 2012 19:58:43 -0500 Subject: [PATCH 11/18] gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4af44bf..a67e6ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.pyc *.so *.egg-info +*.swp From cf54011c10e74ff638c4f7db519b15d838deca46 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 29 Feb 2012 01:11:49 -0500 Subject: [PATCH 12/18] bug in position of new assert --- asgd/naive_asgd.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index d14a9db..4ba5167 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -87,10 +87,10 @@ def fit_converged(self): train_means = self.train_means rtol = self.fit_rel_tolerance atol = self.fit_abs_tolerance - assert np.min(train_means) >= 0 # -- check for perfect fit if len(train_means) > 1: + assert np.min(train_means) >= 0 if np.allclose(train_means[-1], 0, atol=atol, rtol=rtol): return True @@ -149,7 +149,6 @@ def partial_fit(self, X, y): n_observations = self.n_observations train_means = self.train_means recent_train_costs = self.recent_train_costs - min_observations = self.min_observations for obs, label in izip(X, y): From 6c5b8483891052b03286e708c04e7408b2086856 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Mon, 19 Mar 2012 18:45:52 -0400 Subject: [PATCH 13/18] adding NaiveRankASGD --- asgd/__init__.py | 4 +- asgd/auto_step_size.py | 11 +- asgd/naive_asgd.py | 278 +++++++++++++++++++++++++++-------------- 3 files changed, 189 insertions(+), 104 deletions(-) diff --git a/asgd/__init__.py b/asgd/__init__.py index eabbd9e..08d1272 100644 --- a/asgd/__init__.py +++ b/asgd/__init__.py @@ -1,3 +1,5 @@ -from naive_asgd import NaiveBinaryASGD, NaiveOVAASGD +from naive_asgd import NaiveBinaryASGD +from naive_asgd import NaiveOVAASGD +from naive_asgd import NaiveRankASGD from experimental_asgd import ExperimentalBinaryASGD diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index d230c97..dfb8af2 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -44,15 +44,11 @@ def eval_size0(log2_size0): other.partial_fit(*partial_fit_args) # Hack: asgd is lower variance than sgd, but it's tuned to work # well asymptotically, not after just a few examples - weights = .5 * (other.asgd_weights + other.sgd_weights) - bias = .5 * (other.asgd_bias + other.sgd_bias) + other.asgd_weights = .5 * (other.asgd_weights + other.sgd_weights) + other.asgd_bias = .5 * (other.asgd_bias + other.sgd_bias) X, y = partial_fit_args[:2] - margin = y * (np.dot(X, weights) + bias) 
- l2_cost = other.l2_regularization * (weights ** 2).sum() - #print 'Hinge:', np.maximum(0, 1 - margin).mean() - #print 'L2', l2_cost - rval = np.maximum(0, 1 - margin).mean() + l2_cost + rval = other.cost(X, y) if np.isnan(rval): rval = float('inf') #print 'find step %e: %e' % (current_step_size, rval) @@ -70,6 +66,7 @@ def eval_size0(log2_size0): return rval +# XXX: use different name, function is not specific to binary classification def binary_fit( model, fit_args, max_examples=DEFAULT_MAX_EXAMPLES, diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 4ba5167..259252e 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -105,6 +105,42 @@ def fit_converged(self): return False + def fit(self, X, y): + + assert X.ndim == 2 + assert y.ndim == 1 + + n_points, n_features = X.shape + assert n_features == self.n_features + assert n_points == y.size + + n_points_remaining = self.max_observations - self.n_observations + + while n_points_remaining > 0: + + # -- every iteration will train from n_partial observations and + # then check for convergence + fit_n_partial = min(n_points_remaining, self.fit_n_partial) + + idx = self.rstate.permutation(n_points) + Xb = X[idx[:fit_n_partial]] + yb = y[idx[:fit_n_partial]] + self.partial_fit(Xb, yb) + + if self.feedback: + raise NotImplementedError( + 'partial_fit logic requires memory to be distinct') + self.sgd_weights = self.asgd_weights + self.sgd_bias = self.asgd_bias + + if (self.n_observations >= self.min_observations + and self.fit_converged()): + break + + n_points_remaining -= len(Xb) + + return self + class BaseBinaryASGD(BaseASGD): @@ -124,6 +160,14 @@ def decision_function(self, X): def predict(self, X): return np.sign(self.decision_function(X)) + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + margin = y * (np.dot(X, weights) + bias) + l2_cost = self.l2_regularization * (weights ** 2).sum() + rval = np.maximum(0, 1 - margin).mean() + l2_cost + return rval + class NaiveBinaryASGD(BaseBinaryASGD): @@ -201,80 +245,33 @@ def partial_fit(self, X, y): return self - def fit(self, X, y): - - assert X.ndim == 2 - assert y.ndim == 1 - - n_points, n_features = X.shape - assert n_features == self.n_features - assert n_points == y.size - - n_points_remaining = self.max_observations - self.n_observations - - while n_points_remaining > 0: - - # -- every iteration will train from n_partial observations and - # then check for convergence - fit_n_partial = min(n_points_remaining, self.fit_n_partial) - - idx = self.rstate.permutation(n_points) - Xb = X[idx[:fit_n_partial]] - yb = y[idx[:fit_n_partial]] - self.partial_fit(Xb, yb) - - if self.feedback: - raise NotImplementedError( - 'partial_fit logic requires memory to be distinct') - self.sgd_weights = self.asgd_weights - self.sgd_bias = self.asgd_bias - - if (self.n_observations >= self.min_observations - and self.fit_converged()): - break - - n_points_remaining -= len(Xb) - - return self - -class NaiveOVAASGD(BaseASGD): - - def __init__(self, n_classes, n_features, - sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, - l2_regularization=DEFAULT_L2_REGULARIZATION, - min_observations=DEFAULT_MIN_OBSERVATIONS, - max_observations=DEFAULT_MAX_OBSERVATIONS, - fit_n_partial=DEFAULT_N_PARTIAL, - fit_abs_tolerance=DEFAULT_FIT_ABS_TOLERANCE, - fit_rel_tolerance=DEFAULT_FIT_REL_TOLERANCE, - feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, - dtype=DEFAULT_DTYPE): - - super(NaiveOVAASGD, self).__init__( - n_features, - sgd_step_size0=sgd_step_size0, - l2_regularization=l2_regularization, - 
min_observations=min_observations, - max_observations=max_observations, - fit_n_partial=fit_n_partial, - fit_abs_tolerance=fit_abs_tolerance, - fit_rel_tolerance=fit_rel_tolerance, - feedback=feedback, - rstate=rstate, - dtype=dtype, - ) +class BaseMultiASGD(BaseASGD): + """ + Allocate weight vectors for one-vs-all multiclass SVM + """ + def __init__(self, n_classes, n_features, *args, **kwargs): + BaseASGD.__init__(self, n_features, *args, **kwargs) # -- - assert n_classes > 1 self.n_classes = n_classes + dtype = self.dtype # -- self.sgd_weights = np.zeros((n_features, n_classes), dtype=dtype) - self.sgd_bias = np.zeros((n_classes,), dtype=dtype) + self.sgd_bias = np.zeros(n_classes, dtype=dtype) self.asgd_weights = np.zeros((n_features, n_classes), dtype=dtype) - self.asgd_bias = np.zeros((n_classes), dtype=dtype) + self.asgd_bias = np.zeros(n_classes, dtype=dtype) + + def decision_function(self, X): + X = np.asarray(X) + return dot(X, self.asgd_weights) + self.asgd_bias + + def predict(self, X): + return self.decision_function(X).argmax(axis=1) + + +class NaiveOVAASGD(BaseMultiASGD): def partial_fit(self, X, y): @@ -299,6 +296,9 @@ def partial_fit(self, X, y): n_observations = self.n_observations n_classes = self.n_classes + train_means = self.train_means + recent_train_costs = self.recent_train_costs + for obs, label in izip(X, y): label = 2 * (np.arange(n_classes) == label).astype(int) - 1 @@ -310,13 +310,19 @@ def partial_fit(self, X, y): sgd_weights *= (1 - l2_regularization * sgd_step_size) violations = margin < 1 - label_violated = label[violations] - sgd_weights[:, violations] += ( - sgd_step_size - * label_violated[np.newaxis, :] - * obs[:, np.newaxis] - ) - sgd_bias[violations] += sgd_step_size * label_violated + + if violations: + label_violated = label[violations] + sgd_weights[:, violations] += ( + sgd_step_size + * label_violated[np.newaxis, :] + * obs[:, np.newaxis] + ) + sgd_bias[violations] += sgd_step_size * label_violated + recent_train_costs.append( + n_classes - margin[violations].sum()) + else: + recent_train_costs.append(0.0) # -- update asgd asgd_weights = (1 - asgd_step_size) * asgd_weights \ @@ -333,6 +339,12 @@ def partial_fit(self, X, y): sgd_step_size_scheduling_exponent) asgd_step_size = 1. / n_observations + if len(recent_train_costs) == self.fit_n_partial: + train_means.append(np.mean(recent_train_costs) + + l2_regularization * np.dot( + self.asgd_weights, self.asgd_weights)) + self.recent_train_costs = recent_train_costs = [] + # -- self.sgd_weights = sgd_weights self.sgd_bias = sgd_bias @@ -346,38 +358,112 @@ def partial_fit(self, X, y): return self - def fit(self, X, y): - assert X.ndim == 2 - assert y.ndim == 1 +class NaiveRankASGD(BaseMultiASGD): + """ + Implements rank-based multiclass SVM. 
+ """ - n_points, n_features = X.shape - assert n_features == self.n_features - assert n_points == y.size + def partial_fit(self, X, y): + if set(y) > set(range(self.n_classes)): + raise ValueError("Invalid 'y'") - n_points_remaining = self.max_observations - self.n_observations + sgd_step_size0 = self.sgd_step_size0 + sgd_step_size = self.sgd_step_size + sgd_step_size_scheduling_exponent = \ + self.sgd_step_size_scheduling_exponent + sgd_step_size_scheduling_multiplier = \ + self.sgd_step_size_scheduling_multiplier + sgd_weights = self.sgd_weights + sgd_bias = self.sgd_bias - while n_points_remaining > 0: - # -- every iteration will train from n_partial observations and - # then check for convergence - fit_n_partial = min(n_points_remaining, self.fit_n_partial) + asgd_weights = self.asgd_weights + asgd_bias = self.asgd_bias + asgd_step_size = self.asgd_step_size - idx = self.rstate.permutation(n_points) - Xb = X[idx[:fit_n_partial]] - yb = y[idx[:fit_n_partial]] - self.partial_fit(Xb, yb) + l2_regularization = self.l2_regularization - if self.feedback: - self.sgd_weights = self.asgd_weights - self.sgd_bias = self.asgd_bias + n_observations = self.n_observations + n_classes = self.n_classes - n_points_remaining -= len(Xb) + train_means = self.train_means + recent_train_costs = self.recent_train_costs + + for obs, label in izip(X, y): + + # -- compute margin + decision = dot(obs, sgd_weights) + sgd_bias + + # -- update sgd + if l2_regularization: + sgd_weights *= (1 - l2_regularization * sgd_step_size) + + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] is label else dsrt[-1] + margin = decision[label] - decision[distractor] + + if margin < 1: + winc = sgd_step_size * obs + sgd_weights[:, distractor] -= winc + sgd_weights[:, label] += winc + sgd_bias[distractor] -= sgd_step_size + sgd_bias[label] += sgd_step_size + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0.0) + + # -- update asgd + # -- TODO: delay updates until sgd_weights are modified, + # only 2 rows change per iteration + asgd_weights = (1 - asgd_step_size) * asgd_weights \ + + asgd_step_size * sgd_weights + asgd_bias = (1 - asgd_step_size) * asgd_bias \ + + asgd_step_size * sgd_bias + + # -- update step_sizes + n_observations += 1 + sgd_step_size_scheduling = (1 + sgd_step_size0 * n_observations * + sgd_step_size_scheduling_multiplier) + sgd_step_size = sgd_step_size0 / \ + (sgd_step_size_scheduling ** \ + sgd_step_size_scheduling_exponent) + asgd_step_size = 1. 
/ n_observations + + if len(recent_train_costs) == self.fit_n_partial: + flat_weights = asgd_weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + train_means.append(np.mean(recent_train_costs) + + l2_regularization * l2_term) + self.recent_train_costs = recent_train_costs = [] + + # -- + self.sgd_weights = sgd_weights + self.sgd_bias = sgd_bias + self.sgd_step_size = sgd_step_size + + self.asgd_weights = asgd_weights + self.asgd_bias = asgd_bias + self.asgd_step_size = asgd_step_size + + self.n_observations = n_observations return self - def decision_function(self, X): - return dot(X, self.asgd_weights) + self.asgd_bias - def predict(self, X): - return self.decision_function(X).argmax(1) + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + decisions = dot(X, weights) + bias + margins = [] + for decision, label in zip(decisions, y): + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] is label else dsrt[-1] + margin = decision[label] - decision[distractor] + margins.append(margin) + + flat_weights = weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + l2_cost = self.l2_regularization * l2_term + rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost + return rval From 23a859701dd92a362d9a09b0f9e8aeab251ce238 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 20 Mar 2012 16:48:59 -0400 Subject: [PATCH 14/18] adding SparseUpdateASGD --- asgd/__init__.py | 1 + asgd/auto_step_size.py | 7 +- asgd/naive_asgd.py | 176 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 173 insertions(+), 11 deletions(-) diff --git a/asgd/__init__.py b/asgd/__init__.py index 08d1272..b9a13ea 100644 --- a/asgd/__init__.py +++ b/asgd/__init__.py @@ -1,5 +1,6 @@ from naive_asgd import NaiveBinaryASGD from naive_asgd import NaiveOVAASGD from naive_asgd import NaiveRankASGD +from naive_asgd import SparseUpdateRankASGD from experimental_asgd import ExperimentalBinaryASGD diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index dfb8af2..f7d804e 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -1,5 +1,6 @@ import copy import logging +import time import numpy as np import scipy.optimize @@ -102,9 +103,11 @@ def binary_fit( idxs = all_idxs[:max_examples] # Find the best learning rate for that subset + t0 = time.time() best = find_sgd_step_size0( model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) - logger.info('found best: %e' % best) + logger.info('found best stepsize %e in %f seconds' % ( + best, time.time() - t0)) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. 
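
The timing message added in this hunk goes through the module-level logger introduced in PATCH 07 (logging.getLogger(__name__), so it is named 'asgd.auto_step_size'). A caller who wants to see these messages only needs standard-library logging configuration, for example:

    import logging
    logging.basicConfig(level=logging.INFO)
    # or keep the root at WARNING and raise just this module:
    # logging.basicConfig()
    # logging.getLogger('asgd.auto_step_size').setLevel(logging.INFO)
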
@@ -114,6 +117,8 @@ def binary_fit( logger.info('setting sgd_step_size: %e' % step_size0) model.sgd_step_size0 = float(step_size0) model.sgd_step_size = float(step_size0) + t0 = time.time() model.fit(*fit_args) + logger.info('full fit took %f seconds' % (time.time() - t0)) return model diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 259252e..b73a671 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -365,7 +365,6 @@ class NaiveRankASGD(BaseMultiASGD): """ def partial_fit(self, X, y): - if set(y) > set(range(self.n_classes)): raise ValueError("Invalid 'y'") @@ -395,14 +394,14 @@ def partial_fit(self, X, y): # -- compute margin decision = dot(obs, sgd_weights) + sgd_bias - # -- update sgd - if l2_regularization: - sgd_weights *= (1 - l2_regularization * sgd_step_size) - dsrt = np.argsort(decision) - distractor = dsrt[-2] if dsrt[-1] is label else dsrt[-1] + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] margin = decision[label] - decision[distractor] + # -- update sgd + if l2_regularization: + sgd_weights *= 1 - l2_regularization * sgd_step_size + if margin < 1: winc = sgd_step_size * obs sgd_weights[:, distractor] -= winc @@ -414,8 +413,6 @@ def partial_fit(self, X, y): recent_train_costs.append(0.0) # -- update asgd - # -- TODO: delay updates until sgd_weights are modified, - # only 2 rows change per iteration asgd_weights = (1 - asgd_step_size) * asgd_weights \ + asgd_step_size * sgd_weights asgd_bias = (1 - asgd_step_size) * asgd_bias \ @@ -450,15 +447,173 @@ def partial_fit(self, X, y): return self + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + decisions = dot(np.asarray(X), weights) + bias + margins = [] + for decision, label in zip(decisions, y): + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + margins.append(margin) + + flat_weights = weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + l2_cost = self.l2_regularization * l2_term + rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost + return rval + + +#import line_profiler +#profile = line_profiler.LineProfiler() +#import time + + +class SparseUpdateRankASGD(BaseMultiASGD): + """ + Implements rank-based multiclass SVM. 
+ """ + + #@profile + def partial_fit(self, X, y): + if set(y) > set(range(self.n_classes)): + raise ValueError("Invalid 'y'") + #ttt = time.time() + + sgd_step_size0 = self.sgd_step_size0 + sgd_step_size = self.sgd_step_size + sgd_step_size_scheduling_exponent = \ + self.sgd_step_size_scheduling_exponent + sgd_step_size_scheduling_multiplier = \ + self.sgd_step_size_scheduling_multiplier + sgd_weights = np.asarray(self.sgd_weights, order='F') + sgd_bias = self.sgd_bias + + asgd_weights = np.asarray(self.asgd_weights, order='F') + asgd_bias = self.asgd_bias + asgd_step_size = self.asgd_step_size + + l2_regularization = self.l2_regularization + + n_observations = self.n_observations + n_classes = self.n_classes + + train_means = self.train_means + recent_train_costs = self.recent_train_costs + + # -- the logical sgd_weight matrix is sgd_weights_scale * sgd_weights + sgd_weights_scale = 1.0 + + use_asgd_scale = True + + if use_asgd_scale: + # -- the logical asgd_weights is stored as a row-wise linear + # -- combination of asgd_weights and sgd_weights + asgd_scale = np.ones((n_classes, 2), dtype=asgd_weights.dtype) + asgd_scale[:, 1] = 0 # -- originally there is no sgd contribution + + for obs, label in izip(X, y): + + # -- compute margin + decision = sgd_weights_scale * dot(obs, sgd_weights) + sgd_bias + + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + + # -- update sgd + sgd_weights_scale *= 1 - l2_regularization * sgd_step_size + + if margin < 1: + # -- perform the delayed updates to the rows of asgd + + if use_asgd_scale: + # -- XXX: this should be a single BLAS call to axpy + asgd_weights[:, distractor] *= asgd_scale[distractor, 0] + asgd_weights[:, distractor] += \ + (asgd_scale[distractor, 1] * sgd_weights_scale) \ + * sgd_weights[:, distractor] + asgd_scale[distractor] = [1, 0] + + # -- XXX: this should be a single BLAS call to axpy + asgd_weights[:, label] *= asgd_scale[label, 0] + asgd_weights[:, label] += \ + (asgd_scale[label, 1] * sgd_weights_scale)\ + * sgd_weights[:, label] + asgd_scale[label] = [1, 0] + + winc = (sgd_step_size / sgd_weights_scale) * obs + sgd_weights[:, distractor] -= winc + sgd_weights[:, label] += winc + sgd_bias[distractor] -= sgd_step_size + sgd_bias[label] += sgd_step_size + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0.0) + + if use_asgd_scale: + # -- update asgd via scale variables + asgd_scale *= (1 - asgd_step_size) + asgd_scale[:, 1] += asgd_step_size + else: + asgd_weights = (1 - asgd_step_size) * asgd_weights \ + + asgd_step_size * sgd_weights + + asgd_bias = (1 - asgd_step_size) * asgd_bias \ + + asgd_step_size * sgd_bias + + # -- update step_sizes + n_observations += 1 + sgd_step_size_scheduling = (1 + sgd_step_size0 * n_observations * + sgd_step_size_scheduling_multiplier) + sgd_step_size = sgd_step_size0 / \ + (sgd_step_size_scheduling ** \ + sgd_step_size_scheduling_exponent) + asgd_step_size = 1. 
/ n_observations + + if len(recent_train_costs) == self.fit_n_partial: + if use_asgd_scale: + asgd_weights[:] *= asgd_scale[:, 0] + asgd_weights[:] += (sgd_weights_scale * asgd_scale[:, 1]) * sgd_weights + asgd_scale[:, 0] = 1 + asgd_scale[:, 1] = 0 + + flat_weights = asgd_weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + train_means.append(np.mean(recent_train_costs) + + l2_regularization * l2_term) + self.recent_train_costs = recent_train_costs = [] + + # -- + self.sgd_weights = sgd_weights * sgd_weights_scale + self.sgd_bias = sgd_bias + self.sgd_step_size = sgd_step_size + + if use_asgd_scale: + asgd_weights[:] *= asgd_scale[:, 0] + asgd_weights[:] += (sgd_weights_scale * asgd_scale[:, 1]) * sgd_weights + asgd_scale[:, 0] = 1 + asgd_scale[:, 1] = 0 + + self.asgd_weights = asgd_weights + self.asgd_bias = asgd_bias + self.asgd_step_size = asgd_step_size + + self.n_observations = n_observations + + #profile.print_stats() + #print 'n_obs', n_observations, 'time/%i' % len(X), (time.time() - ttt) + return self def cost(self, X, y): weights = self.asgd_weights bias = self.asgd_bias - decisions = dot(X, weights) + bias + decisions = dot(np.asarray(X), weights) + bias margins = [] for decision, label in zip(decisions, y): dsrt = np.argsort(decision) - distractor = dsrt[-2] if dsrt[-1] is label else dsrt[-1] + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] margin = decision[label] - decision[distractor] margins.append(margin) @@ -467,3 +622,4 @@ def cost(self, X, y): l2_cost = self.l2_regularization * l2_term rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost return rval + From 0cc4515e709cd13e945eeef1348fa2498bef48ab Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 20 Mar 2012 16:54:07 -0400 Subject: [PATCH 15/18] adding linsvm --- asgd/linsvm.py | 143 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 asgd/linsvm.py diff --git a/asgd/linsvm.py b/asgd/linsvm.py new file mode 100644 index 0000000..9e343b9 --- /dev/null +++ b/asgd/linsvm.py @@ -0,0 +1,143 @@ +import numpy as np + +from .auto_step_size import binary_fit +from .naive_asgd import NaiveBinaryASGD +from .naive_asgd import NaiveRankASGD +from .naive_asgd import SparseUpdateRankASGD + +try: + import sklearn.svm +except ImportError: + pass + + +class LinearSVM(object): + """ + SVM-fitting object that implements a heuristic for choosing + the right solver among several that may be installed in sklearn, and asgd. 
+ + """ + + def __init__(self, l2_regularization, solver='auto', label_dct=None, + label_weights=None): + self.l2_regularization = l2_regularization + self.solver = solver + self.label_dct = label_dct + self.label_weights = label_weights + + def fit(self, X, y, weights=None, bias=None): + solver = self.solver + label_dct = self.label_dct + l2_regularization = self.l2_regularization + + if weights or bias: + raise NotImplementedError( + 'Currently only train_set = (X, y) is supported') + del weights, bias + if self.label_weights: + raise NotImplementedError() + + n_train, n_feats = X.shape + if self.label_dct is None: + label_dct = dict([(v, i) for (i, v) in enumerate(sorted(set(y)))]) + else: + label_dct = self.label_dct + n_classes = len(label_dct) + + if n_classes < 2: + raise ValueError('must be at least 2 labels') + + elif n_classes == 2: + if set(y) != set([-1, 1]): + # TODO: use the label_dct to automatically adjust + raise NotImplementedError() + + if solver == 'auto': + if n_feats > n_train: + solver = ('sklearn.svm.SVC', {'kernel': 'precomputed'}) + else: + solver = ('asgd.NaiveBinaryASGD', {}) + + method, method_kwargs = solver + + if method == 'asgd.NaiveBinaryASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveBinaryASGD( + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'sklearn.svm.SVC': + C = 1.0 / (l2_regularization * len(X)) + svm = sklearn.svm.SVC(C=C, scale_C=False, **method_kwargs) + raise NotImplementedError( + 'save ktrn, multiply Xtst by X in predict()') + ktrn = linear_kernel(X, X) + svm.fit(ktrn, y) + + else: + raise ValueError('unrecognized method', method) + + else: # n_classes > 2 + if set(y) != set(range(len(set(y)))): + # TODO: use the label_dct to automatically adjust + raise NotImplementedError('labels need adapting', + set(y)) + if solver == 'auto': + # TODO: switch to OVA and use libSVM? + #if n_feats > n_train: + #solver = ('asgd.NaiveRankASGD', { }) + solver = ('asgd.SparseUpdateRankASGD', {}) + + method, method_kwargs = solver + + if method == 'asgd.NaiveRankASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveRankASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'asgd.SparseUpdateRankASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = SparseUpdateRankASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'asgd.NaiveOVAASGD': + # -- one vs. all + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveOVAASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'sklearn.svm.SVC': + # -- one vs. one + raise NotImplementedError(method) + + elif method == 'sklearn.svm.NuSVC': + # -- one vs. one + raise NotImplementedError(method) + + elif method == 'sklearn.svm.LinearSVC': + # -- one vs. 
all + raise NotImplementedError(method) + + else: + raise ValueError('unrecognized method', method) + + self.svm = svm + + def predict(self, *args, **kwargs): + return self.svm.predict(*args, **kwargs) + + def decision_function(self, *args, **kwargs): + return self.svm.decision_function(*args, **kwargs) + + From cd9da187705890ab51062cde0b8dc1501dacf2c2 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 20 Mar 2012 16:58:00 -0400 Subject: [PATCH 16/18] comment for linsvm --- asgd/linsvm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/asgd/linsvm.py b/asgd/linsvm.py index 9e343b9..8efde54 100644 --- a/asgd/linsvm.py +++ b/asgd/linsvm.py @@ -1,3 +1,8 @@ +""" +Automatic heuristic solver selection: LinearSVM + +""" + import numpy as np from .auto_step_size import binary_fit From 104bccf4b4e54639b8dbc819bf80bef3b3d05104 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 20 Mar 2012 17:15:23 -0400 Subject: [PATCH 17/18] adding LinearSVM to toplevel --- asgd/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asgd/__init__.py b/asgd/__init__.py index b9a13ea..4501d95 100644 --- a/asgd/__init__.py +++ b/asgd/__init__.py @@ -4,3 +4,5 @@ from naive_asgd import SparseUpdateRankASGD from experimental_asgd import ExperimentalBinaryASGD + +from linsvm import LinearSVM From c1e0a3aa16c9391616ba86a1f28e9a20574d70b9 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Thu, 22 Mar 2012 09:27:05 -0400 Subject: [PATCH 18/18] fixing tests and changing early_stop minimization method --- asgd/auto_step_size.py | 51 ++++++++++++++++++++----------- asgd/linsvm.py | 17 +++++++---- asgd/naive_asgd.py | 15 +++++++-- asgd/tests/test_auto_step_size.py | 41 +++++++++---------------- asgd/tests/test_naive_asgd.py | 4 +-- 5 files changed, 73 insertions(+), 55 deletions(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index f7d804e..15a0dd9 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -30,7 +30,7 @@ def find_sgd_step_size0( ------- Optimal sgd_step_size0 given `X` and `y`. """ - # -- stupid scipy calls some sizes twice!? + # -- stupid solver calls some sizes twice!? _cache = {} def eval_size0(log2_size0): @@ -52,18 +52,37 @@ def eval_size0(log2_size0): rval = other.cost(X, y) if np.isnan(rval): rval = float('inf') - #print 'find step %e: %e' % (current_step_size, rval) + logger.info('find step %e: %e' % (current_step_size, rval)) + _cache[float(log2_size0)] = rval return rval - log2_best_sgd_step_size0 = scipy.optimize.fmin( - eval_size0, - x0=np.log2(model.sgd_step_size0), - xtol=tolerance, - ) - - #print 'Brent came back with', log2_best_sgd_step_size0, 2 ** log2_best_sgd_step_size0 - - rval = 2.0 ** log2_best_sgd_step_size0 + if tolerance < 0.5: + raise NotImplementedError( + 'tolerance too small, need adaptive stepsize') + + step = -tolerance + x0 = np.log2(model.sgd_step_size0) + x1 = np.log2(model.sgd_step_size0) + step + y0 = eval_size0(x0) + y1 = eval_size0(x1) + if y1 > y0: + step *= -1 + y0, y1 = y1, y0 + + while y1 < y0: + x0, y0 = x1, y1 + x1 = x0 + step + y1 = eval_size0(x1) + + # I tried using sp.optimize.fmin, but this function is bumpy and we only + # want a really coarse estimate of the optimmum, so fmin and fmin_powell + # end up being relatively inefficient even compared to this really stupid + # search. 
+ # + # TODO: increase the stepsize every time it still goes down, and then + # backtrack when we over-step + + rval = 2.0 ** x0 return rval @@ -77,7 +96,7 @@ def binary_fit( Parameters ---------- - model: BinaryASGD + model: BaseASGD instance Instance of the model to be fitted. fit_args - tuple of args to model.fit @@ -90,12 +109,11 @@ def binary_fit( Returns ------- - model: BinaryASGD - Instances of the model, fitted with an estimate of the best - sgd_step_size0 + model: model, fitted with an estimate of the best sgd_step_size0 """ assert max_examples > 0 + logger.info('binary_fit: design matrix shape %s' % str(fit_args[0].shape)) # randomly choose up to max_examples uniformly without replacement from # across the whole set of training data. @@ -111,8 +129,7 @@ def binary_fit( # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. - stepdown = np.sqrt( float(len(all_idxs)) / float(len(idxs))) - step_size0 = max(best / stepdown, step_size_floor) + step_size0 = max(best / 2.0, step_size_floor) logger.info('setting sgd_step_size: %e' % step_size0) model.sgd_step_size0 = float(step_size0) diff --git a/asgd/linsvm.py b/asgd/linsvm.py index 8efde54..f06c15a 100644 --- a/asgd/linsvm.py +++ b/asgd/linsvm.py @@ -90,10 +90,9 @@ def fit(self, X, y, weights=None, bias=None): raise NotImplementedError('labels need adapting', set(y)) if solver == 'auto': - # TODO: switch to OVA and use libSVM? - #if n_feats > n_train: - #solver = ('asgd.NaiveRankASGD', { }) - solver = ('asgd.SparseUpdateRankASGD', {}) + solver = ('asgd.SparseUpdateRankASGD', { + 'sgd_step_size0': 10.0 / X.shape[1], + }) method, method_kwargs = solver @@ -131,8 +130,14 @@ def fit(self, X, y, weights=None, bias=None): raise NotImplementedError(method) elif method == 'sklearn.svm.LinearSVC': - # -- one vs. all - raise NotImplementedError(method) + # -- Crammer & Singer multi-class + C = 1.0 / (l2_regularization * len(X)) + svm = sklearn.svm.LinearSVC( + C=C, + scale_C=False, + multi_class=True, + **method_kwargs) + svm.fit(X, y) else: raise ValueError('unrecognized method', method) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index b73a671..6fd0156 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -165,7 +165,8 @@ def cost(self, X, y): bias = self.asgd_bias margin = y * (np.dot(X, weights) + bias) l2_cost = self.l2_regularization * (weights ** 2).sum() - rval = np.maximum(0, 1 - margin).mean() + l2_cost + hinge_loss = np.maximum(0, 1 - margin) + rval = hinge_loss.mean() + l2_cost return rval @@ -311,7 +312,7 @@ def partial_fit(self, X, y): violations = margin < 1 - if violations: + if np.any(violations): label_violated = label[violations] sgd_weights[:, violations] += ( sgd_step_size @@ -475,6 +476,11 @@ class SparseUpdateRankASGD(BaseMultiASGD): Implements rank-based multiclass SVM. """ + #XXX + #XXX + # TODO: implement l2-SVM here by updating + # sgd rows in proportion to hinge loss + #@profile def partial_fit(self, X, y): if set(y) > set(range(self.n_classes)): @@ -570,7 +576,10 @@ def partial_fit(self, X, y): sgd_step_size = sgd_step_size0 / \ (sgd_step_size_scheduling ** \ sgd_step_size_scheduling_exponent) - asgd_step_size = 1. / n_observations + if n_observations <= self.fit_n_partial: + asgd_step_size = 1. + else: + asgd_step_size = 1. 
/ (n_observations - self.fit_n_partial) if len(recent_train_costs) == self.fit_n_partial: if use_asgd_scale: diff --git a/asgd/tests/test_auto_step_size.py b/asgd/tests/test_auto_step_size.py index 38e4d1a..1f3c672 100644 --- a/asgd/tests/test_auto_step_size.py +++ b/asgd/tests/test_auto_step_size.py @@ -1,4 +1,5 @@ from nose.tools import assert_equal +import numpy as np from numpy.testing import assert_almost_equal from numpy.testing import assert_array_equal @@ -13,7 +14,7 @@ def get_new_model(n_features, rstate, n_points): return BinaryASGD(n_features, rstate=rstate, - sgd_step_size0=1e3, + sgd_step_size0=1e3, # intentionally large, binary_fit will fix l2_regularization=1e-3, max_observations=5 * n_points, min_observations=5 * n_points, @@ -27,13 +28,9 @@ def test_binary_sgd_step_size0(): X, y = get_fake_data(100, n_features, rstate) clf = get_new_model(n_features, rstate, 100) - best0 = find_sgd_step_size0(clf, (X, y), (.25, .5), tolerance=1e-6) - - # start a little lower, still works - best1 = find_sgd_step_size0(clf, (X, y), (.125, .25), tolerance=1e-6) - print best0, best1 - assert_almost_equal(best0, 0.03501, decimal=4) - assert_almost_equal(best1, 0.03501, decimal=4) + best0 = find_sgd_step_size0(clf, (X, y)) + print best0 + assert np.allclose(best0, 0.04, atol=.1, rtol=.5) # find_sgd_step_size0 does not change clf assert clf.sgd_step_size0 == 1000.0 @@ -43,25 +40,15 @@ def test_binary_fit(): rstate = RandomState(42) n_features = 20 - clf100 = get_new_model(n_features, rstate, 100) - X, y = get_fake_data(100, n_features, rstate) - _clf100 = binary_fit(clf100, (X, y)) - assert _clf100 is clf100 - assert_almost_equal(clf100.sgd_step_size0, 0.00954, decimal=4) - - # smoke test - clf1000 = get_new_model(n_features, rstate, 1000) - X, y = get_fake_data(DEFAULT_MAX_EXAMPLES, n_features, rstate) - _clf1000 = binary_fit(clf1000, (X, y)) - assert _clf1000 is clf1000 - assert_almost_equal(clf1000.sgd_step_size0, 0.00201, decimal=4) - - # smoke test that at least it runs - clf2000 = get_new_model(n_features, rstate, 2000) - X, y = get_fake_data(2000, n_features, rstate) - _clf2000 = binary_fit(clf2000, (X, y)) - assert _clf2000 == clf2000 - assert_almost_equal(clf2000.sgd_step_size0, 0.001498, decimal=4) + for L in [100, DEFAULT_MAX_EXAMPLES, int(DEFAULT_MAX_EXAMPLES * 1.5), + int(DEFAULT_MAX_EXAMPLES * 3)]: + + clf = get_new_model(n_features, rstate, L) + X, y = get_fake_data(L, n_features, rstate, separation=0.1) + best = find_sgd_step_size0(clf, (X, y)) + _clf = binary_fit(clf, (X, y)) + assert _clf is clf + assert 0 < clf.sgd_step_size0 <= best def test_fit_replicable(): diff --git a/asgd/tests/test_naive_asgd.py b/asgd/tests/test_naive_asgd.py index 668c073..6826373 100644 --- a/asgd/tests/test_naive_asgd.py +++ b/asgd/tests/test_naive_asgd.py @@ -22,10 +22,10 @@ dtype=np.float32) -def get_fake_data(n_points, n_features, rstate): +def get_fake_data(n_points, n_features, rstate, separation=1e-1): X = rstate.randn(n_points, n_features).astype(np.float32) y = 2 * (rstate.randn(n_points) > 0) - 1 - X[y == 1] += 1e-1 + X[y == 1] += separation return X, y
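
Note on the rank-based multiclass cost (cf. the cost method patched above): the margin for one example is the score of the true class minus the score of the strongest competing class, and the hinge penalizes margins below 1. A small numeric illustration; the scores here are invented for the example:

    import numpy as np

    decision = np.array([0.2, 1.5, 0.9])   # per-class scores for one example
    label = 1
    order = np.argsort(decision)           # [0, 2, 1]
    # strongest class other than the true label
    distractor = order[-2] if order[-1] == label else order[-1]   # -> class 2
    margin = decision[label] - decision[distractor]               # 1.5 - 0.9 = 0.6
    hinge = max(0.0, 1.0 - margin)                                # 0.4
    print(distractor, margin, hinge)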
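
Note on the LinearSVM wrapper added in PATCH 15-17: it is a thin front-end that picks a solver ('auto' chooses an asgd model for binary problems with more examples than features) and then delegates predict and decision_function to the fitted model. A minimal usage sketch of the intended API, assuming the package is importable as asgd and that binary labels are coded as {-1, +1} as the wrapper requires; the data is made up for illustration:

    import numpy as np
    from asgd import LinearSVM

    rng = np.random.RandomState(0)
    X = rng.randn(200, 50).astype(np.float32)
    y = 2 * (rng.randn(200) > 0) - 1          # labels in {-1, +1}
    X[y == 1] += 0.1                          # nudge the positive class apart

    clf = LinearSVM(l2_regularization=1e-3, solver='auto')
    clf.fit(X, y)                             # n_train > n_feats -> asgd binary path
    print(clf.predict(X[:5]))
    print(clf.decision_function(X[:5]))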
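
Note on the step-size search in PATCH 18: scipy.optimize.fmin is replaced by a coarse walk in log2(step_size) space that keeps moving in one direction while the cost keeps dropping, flipping direction once if the first move goes uphill. A simplified standalone version of that idea, not line-for-line the patched function, with an invented 1-D test curve:

    import numpy as np

    def coarse_downhill_search(f, x_start, step=0.5):
        """Step in one direction while f keeps decreasing; flip the
        direction once if the first move is uphill. Return the best x."""
        x_best, y_best = x_start, f(x_start)
        x_next = x_best + step
        y_next = f(x_next)
        if y_next > y_best:
            step = -step
            x_next = x_best + step
            y_next = f(x_next)
        while y_next < y_best:
            x_best, y_best = x_next, y_next
            x_next = x_best + step
            y_next = f(x_next)
        return x_best

    # e.g. minimizing a bumpy curve in log2(step_size) space
    log2_best = coarse_downhill_search(
        lambda z: (z - 3.2) ** 2 + 0.1 * np.sin(20 * z), x_start=0.0)
    best_step_size = 2.0 ** log2_best
    print(log2_best, best_step_size)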
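
Note on the asgd_step_size change in PATCH 18: averaging now starts only after the first fit_n_partial observations, so the averaged weights are the running mean of the SGD iterates seen after that burn-in rather than of all iterates. A small check of the recurrence; fit_n_partial and the fake iterates below are stand-ins chosen for the example:

    import numpy as np

    def averaged_with_burn_in(ws, k):
        """Apply the patched averaging rule: step = 1 for the first k
        observations, then 1 / (n_obs - k) afterwards."""
        avg = np.zeros_like(ws[0])
        for n_obs, w in enumerate(ws, start=1):
            step = 1.0 if n_obs <= k else 1.0 / (n_obs - k)
            avg = (1.0 - step) * avg + step * w
        return avg

    ws = [np.array([float(t)]) for t in range(1, 11)]    # fake sgd iterates
    print(averaged_with_burn_in(ws, k=3))                # [ 7.]
    print(np.mean(ws[3:], axis=0))                       # also [ 7.]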