From ce0f064006638b6b1d5f9f328a08f79600f3fd29 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 31 Jan 2012 16:04:13 -0500 Subject: [PATCH 01/18] wip --- asgd/naive_asgd.py | 72 +++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 0396344..1eda265 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -9,26 +9,33 @@ DEFAULT_SGD_STEP_SIZE0 = 1e-2 DEFAULT_L2_REGULARIZATION = 1e-3 -DEFAULT_N_ITERATIONS = 10 +DEFAULT_MIN_OBSERVATIONS = 1000 +DEFAULT_MAX_OBSERVATIONS = None DEFAULT_FEEDBACK = False DEFAULT_RSTATE = None DEFAULT_DTYPE = np.float32 +DEFAULT_N_ITERATIONS = 10 # -- used for OVA + class BaseASGD(object): def __init__(self, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): + min_observations=DEFAULT_MIN_OBSERVATIONS, + max_observations=DEFAULT_MAX_OBSERVATIONS, + feedback=DEFAULT_FEEDBACK, + rstate=DEFAULT_RSTATE, + dtype=DEFAULT_DTYPE): # -- assert n_features > 1 self.n_features = n_features - assert n_iterations > 0 - self.n_iterations = n_iterations + assert 0 <= min_observations <= max_observations + self.min_observations = min_observations + self.max_observations = max_observations if feedback: raise NotImplementedError("FIXME: feedback support is buggy") @@ -51,33 +58,40 @@ def __init__(self, n_features, self.asgd_step_size0 = 1 self.asgd_step_size = self.asgd_step_size0 + # -- self.n_observations = 0 + self.train_means = [] + self.recent_train_costs = [] + def fit_converged(self): + train_means = self.train_means + if len(train_means) > 1: + midpt = len(train_means) // 2 + if train_means[-1] > .99 * train_means[midpt]: + return True + return False -class NaiveBinaryASGD(BaseASGD): - def __init__(self, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, - l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): +class BaseBinaryASGD(BaseASGD): - super(NaiveBinaryASGD, self).__init__( - n_features, - sgd_step_size0=sgd_step_size0, - l2_regularization=l2_regularization, - n_iterations=n_iterations, - feedback=feedback, - rstate=rstate, - dtype=dtype, - ) + def __init__(self, *args, **kwargs): + BaseASGD.__init__(self, *args, **kwargs) - # -- self.sgd_weights = np.zeros((n_features), dtype=dtype) self.sgd_bias = np.zeros((1), dtype=dtype) self.asgd_weights = np.zeros((n_features), dtype=dtype) self.asgd_bias = np.zeros((1), dtype=dtype) + + def decision_function(self, X): + return dot(self.asgd_weights, X.T) + self.asgd_bias + + def predict(self, X): + return np.sign(self.decision_function(X)) + +class NaiveBinaryASGD(BaseBinaryASGD): + def partial_fit(self, X, y): sgd_step_size0 = self.sgd_step_size0 @@ -96,6 +110,9 @@ def partial_fit(self, X, y): l2_regularization = self.l2_regularization n_observations = self.n_observations + train_means = self.train_means + recent_train_costs = self.recent_train_costs + min_observations = self.min_observations for obs, label in izip(X, y): @@ -110,6 +127,9 @@ def partial_fit(self, X, y): sgd_weights += sgd_step_size * label * obs sgd_bias += sgd_step_size * label + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0) # -- update asgd asgd_weights = (1 - asgd_step_size) * asgd_weights \ @@ -126,6 +146,12 @@ def partial_fit(self, X, y): 
sgd_step_size_scheduling_exponent) asgd_step_size = 1. / n_observations + if len(recent_train_costs) == min_observations: + train_means.append(np.mean(recent_train_costs) + + l2_regularization * np.dot( + self.asgd_weights, self.asgd_weights)) + recent_train_costs = [] + # -- self.sgd_weights = sgd_weights self.sgd_bias = sgd_bias @@ -163,12 +189,6 @@ def fit(self, X, y): return self - def decision_function(self, X): - return dot(self.asgd_weights, X.T) + self.asgd_bias - - def predict(self, X): - return np.sign(self.decision_function(X)) - class NaiveOVAASGD(BaseASGD): From dcb1a11eb5ef2061c24670ef1d49b2d1fa79f82d Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 31 Jan 2012 17:59:12 -0500 Subject: [PATCH 02/18] writing early stopping better and with tests --- asgd/naive_asgd.py | 81 ++++++++++++++++++++++--------- asgd/tests/test_auto_step_size.py | 18 ++++--- asgd/tests/test_naive_asgd.py | 51 ++++++++++++++++--- 3 files changed, 110 insertions(+), 40 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 1eda265..8a3bec2 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -3,6 +3,7 @@ naive, non-optimized implementation """ +import sys import numpy as np from numpy import dot from itertools import izip @@ -10,12 +11,12 @@ DEFAULT_SGD_STEP_SIZE0 = 1e-2 DEFAULT_L2_REGULARIZATION = 1e-3 DEFAULT_MIN_OBSERVATIONS = 1000 -DEFAULT_MAX_OBSERVATIONS = None +DEFAULT_MAX_OBSERVATIONS = sys.maxint DEFAULT_FEEDBACK = False DEFAULT_RSTATE = None DEFAULT_DTYPE = np.float32 - -DEFAULT_N_ITERATIONS = 10 # -- used for OVA +DEFAULT_N_PARTIAL = 1000 +DEFAULT_FIT_TOLERANCE = 1e-2 class BaseASGD(object): @@ -25,6 +26,8 @@ def __init__(self, n_features, l2_regularization=DEFAULT_L2_REGULARIZATION, min_observations=DEFAULT_MIN_OBSERVATIONS, max_observations=DEFAULT_MAX_OBSERVATIONS, + fit_n_partial=DEFAULT_N_PARTIAL, + fit_tolerance=DEFAULT_FIT_TOLERANCE, feedback=DEFAULT_FEEDBACK, rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): @@ -33,10 +36,15 @@ def __init__(self, n_features, assert n_features > 1 self.n_features = n_features - assert 0 <= min_observations <= max_observations + if not 0 <= min_observations <= max_observations: + raise ValueError('min_observations > max_observations', + (min_observations, max_observations)) self.min_observations = min_observations self.max_observations = max_observations + self.fit_n_partial = fit_n_partial + self.fit_tolerance = fit_tolerance + if feedback: raise NotImplementedError("FIXME: feedback support is buggy") self.feedback = feedback @@ -65,10 +73,10 @@ def __init__(self, n_features, def fit_converged(self): train_means = self.train_means - if len(train_means) > 1: + if len(train_means) > 2: midpt = len(train_means) // 2 - if train_means[-1] > .99 * train_means[midpt]: - return True + thresh = (1 - self.fit_tolerance) * train_means[midpt] + return train_means[-1] > thresh return False @@ -77,19 +85,20 @@ class BaseBinaryASGD(BaseASGD): def __init__(self, *args, **kwargs): BaseASGD.__init__(self, *args, **kwargs) - self.sgd_weights = np.zeros((n_features), dtype=dtype) - self.sgd_bias = np.zeros((1), dtype=dtype) - - self.asgd_weights = np.zeros((n_features), dtype=dtype) - self.asgd_bias = np.zeros((1), dtype=dtype) + self.sgd_weights = np.zeros((self.n_features,), dtype=self.dtype) + self.sgd_bias = np.asarray(0, dtype=self.dtype) + self.asgd_weights = np.zeros((self.n_features,), dtype=self.dtype) + self.asgd_bias = np.asarray(0, dtype=self.dtype) def decision_function(self, X): + X = np.asarray(X) return dot(self.asgd_weights, X.T) 
+ self.asgd_bias def predict(self, X): return np.sign(self.decision_function(X)) + class NaiveBinaryASGD(BaseBinaryASGD): def partial_fit(self, X, y): @@ -146,11 +155,11 @@ def partial_fit(self, X, y): sgd_step_size_scheduling_exponent) asgd_step_size = 1. / n_observations - if len(recent_train_costs) == min_observations: + if len(recent_train_costs) == self.fit_n_partial: train_means.append(np.mean(recent_train_costs) + l2_regularization * np.dot( self.asgd_weights, self.asgd_weights)) - recent_train_costs = [] + self.recent_train_costs = recent_train_costs = [] # -- self.sgd_weights = sgd_weights @@ -174,19 +183,31 @@ def fit(self, X, y): assert n_features == self.n_features assert n_points == y.size - n_iterations = self.n_iterations + n_points_remaining = self.max_observations - self.n_observations + + while n_points_remaining > 0: - for i in xrange(n_iterations): + # -- every iteration will train from n_partial observations and + # then check for convergence + fit_n_partial = min(n_points_remaining, self.fit_n_partial) idx = self.rstate.permutation(n_points) - Xb = X[idx] - yb = y[idx] + Xb = X[idx[:fit_n_partial]] + yb = y[idx[:fit_n_partial]] self.partial_fit(Xb, yb) if self.feedback: + raise NotImplementedError( + 'partial_fit logic requires memory to be distinct') self.sgd_weights = self.asgd_weights self.sgd_bias = self.asgd_bias + if (self.n_observations >= self.min_observations + and self.fit_converged()): + break + + n_points_remaining -= len(Xb) + return self @@ -195,7 +216,10 @@ class NaiveOVAASGD(BaseASGD): def __init__(self, n_classes, n_features, sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, l2_regularization=DEFAULT_L2_REGULARIZATION, - n_iterations=DEFAULT_N_ITERATIONS, + min_observations=DEFAULT_MIN_OBSERVATIONS, + max_observations=DEFAULT_MAX_OBSERVATIONS, + fit_n_partial=DEFAULT_N_PARTIAL, + fit_tolerance=DEFAULT_FIT_TOLERANCE, feedback=DEFAULT_FEEDBACK, rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): @@ -204,7 +228,10 @@ def __init__(self, n_classes, n_features, n_features, sgd_step_size0=sgd_step_size0, l2_regularization=l2_regularization, - n_iterations=n_iterations, + min_observations=min_observations, + max_observations=max_observations, + fit_n_partial=fit_n_partial, + fit_tolerance=fit_tolerance, feedback=feedback, rstate=rstate, dtype=dtype, @@ -299,19 +326,25 @@ def fit(self, X, y): assert n_features == self.n_features assert n_points == y.size - n_iterations = self.n_iterations - for i in xrange(n_iterations): + n_points_remaining = self.max_observations - self.n_observations + + while n_points_remaining > 0: + # -- every iteration will train from n_partial observations and + # then check for convergence + fit_n_partial = min(n_points_remaining, self.fit_n_partial) idx = self.rstate.permutation(n_points) - Xb = X[idx] - yb = y[idx] + Xb = X[idx[:fit_n_partial]] + yb = y[idx[:fit_n_partial]] self.partial_fit(Xb, yb) if self.feedback: self.sgd_weights = self.asgd_weights self.sgd_bias = self.asgd_bias + n_points_remaining -= len(Xb) + return self def decision_function(self, X): diff --git a/asgd/tests/test_auto_step_size.py b/asgd/tests/test_auto_step_size.py index 89799e1..5090837 100644 --- a/asgd/tests/test_auto_step_size.py +++ b/asgd/tests/test_auto_step_size.py @@ -11,11 +11,13 @@ from test_naive_asgd import get_fake_data -def get_new_model(n_features, rstate): +def get_new_model(n_features, rstate, n_points): return BinaryASGD(n_features, rstate=rstate, sgd_step_size0=1e3, l2_regularization=1e-3, - n_iterations=5) + max_observations=5 * n_points, + 
min_observations=5 * n_points, + ) def test_binary_sgd_step_size0(): @@ -24,7 +26,7 @@ def test_binary_sgd_step_size0(): X, y = get_fake_data(100, n_features, rstate) - clf = get_new_model(n_features, rstate) + clf = get_new_model(n_features, rstate, 100) best = find_sgd_step_size0(clf, X, y, (.25, .5)) assert_almost_equal(best, -4.9927, decimal=4) @@ -40,21 +42,21 @@ def test_binary_fit(): rstate = RandomState(42) n_features = 20 - clf100 = get_new_model(n_features, rstate) + clf100 = get_new_model(n_features, rstate, 100) X, y = get_fake_data(100, n_features, rstate) _clf100 = binary_fit(clf100, X, y) assert _clf100 is clf100 assert_almost_equal(clf100.sgd_step_size0, 0.04812, decimal=4) # smoke test - clf1000 = get_new_model(n_features, rstate) + clf1000 = get_new_model(n_features, rstate, 1000) X, y = get_fake_data(DEFAULT_MAX_EXAMPLES, n_features, rstate) _clf1000 = binary_fit(clf1000, X, y) assert _clf1000 is clf1000 assert_almost_equal(clf1000.sgd_step_size0, 0.0047, decimal=4) # smoke test that at least it runs - clf2000 = get_new_model(n_features, rstate) + clf2000 = get_new_model(n_features, rstate, 2000) X, y = get_fake_data(2000, n_features, rstate) _clf2000 = binary_fit(clf2000, X, y) assert _clf2000 == clf2000 @@ -67,10 +69,10 @@ def test_fit_replicable(): X, y = get_fake_data(100, n_features, RandomState(4)) - m0 = get_new_model(n_features, RandomState(45)) + m0 = get_new_model(n_features, RandomState(45), 100) m0 = binary_fit(m0, X, y) - m1 = get_new_model(n_features, RandomState(45)) + m1 = get_new_model(n_features, RandomState(45), 100) m1 = binary_fit(m1, X, y) assert_array_equal(m0.sgd_weights, m1.sgd_weights) diff --git a/asgd/tests/test_naive_asgd.py b/asgd/tests/test_naive_asgd.py index 0900f87..668c073 100644 --- a/asgd/tests/test_naive_asgd.py +++ b/asgd/tests/test_naive_asgd.py @@ -1,3 +1,4 @@ +import sys from nose.tools import assert_equal, raises from nose.plugins.skip import SkipTest from numpy.testing import assert_allclose @@ -16,7 +17,8 @@ DEFAULT_ARGS = (N_FEATURES,) DEFAULT_KWARGS = dict(sgd_step_size0=1e-3, l2_regularization=1e-6, - n_iterations=4, + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, dtype=np.float32) @@ -41,7 +43,6 @@ def get_fake_multiclass_data(n_points, n_features, n_classes, rstate): def test_naive_asgd(): - rstate = RandomState(42) X, y = get_fake_data(N_POINTS, N_FEATURES, rstate) @@ -50,6 +51,7 @@ def test_naive_asgd(): clf = BinaryASGD(*DEFAULT_ARGS, rstate=rstate, **DEFAULT_KWARGS) clf.fit(X, y) + assert clf.n_observations == clf.min_observations == clf.max_observations ytrn_preds = clf.predict(X) ytst_preds = clf.predict(Xtst) ytrn_acc = (ytrn_preds == y).mean() @@ -78,7 +80,6 @@ def test_naive_asgd_with_feedback(): def test_naive_asgd_multi_labels(): - rstate = RandomState(44) X, y = get_fake_binary_data_multi_labels(N_POINTS, N_FEATURES, rstate) @@ -87,7 +88,10 @@ def test_naive_asgd_multi_labels(): # n_classes is 2 since it is actually a binary case clf = OVAASGD(2, N_FEATURES, sgd_step_size0=1e-3, l2_regularization=1e-6, - n_iterations=4, dtype=np.float32, rstate=rstate) + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, + dtype=np.float32, + rstate=rstate) clf.fit(X, y) ytrn_preds = clf.predict(X) ytst_preds = clf.predict(Xtst) @@ -98,7 +102,6 @@ def test_naive_asgd_multi_labels(): def test_naive_multiclass_ova_asgd(): - rstate = RandomState(45) n_classes = 10 @@ -108,7 +111,10 @@ def test_naive_multiclass_ova_asgd(): rstate) clf = OVAASGD(n_classes, N_FEATURES, sgd_step_size0=1e-3, - 
l2_regularization=1e-6, n_iterations=4, dtype=np.float32, + l2_regularization=1e-6, + min_observations=4 * N_POINTS, + max_observations=4 * N_POINTS, + dtype=np.float32, rstate=rstate) clf.fit(X, y) @@ -123,7 +129,6 @@ def test_naive_multiclass_ova_asgd(): def test_naive_multiclass_ova_vs_binary_asgd(): - rstate = RandomState(42) n_classes = 3 @@ -162,7 +167,6 @@ def test_naive_multiclass_ova_vs_binary_asgd(): @raises(ValueError) def test_naive_ova_asgd_wrong_labels(): - rstate = RandomState(42) n_classes = 10 @@ -174,3 +178,34 @@ def test_naive_ova_asgd_wrong_labels(): rstate=RandomState(999), **DEFAULT_KWARGS) ytrn_bad = rstate.randint(n_classes + 42, size=len(ytrn)) clf.partial_fit(Xtrn, ytrn_bad) + + +def test_early_stopping(): + rstate = RandomState(42) + + for N_POINTS in (1000, 2000): + + X, y = get_fake_data(N_POINTS, N_FEATURES, rstate) + Xtst, ytst = get_fake_data(N_POINTS, N_FEATURES, rstate) + + kwargs = dict(DEFAULT_KWARGS) + kwargs['fit_n_partial'] = 1500 + del kwargs['max_observations'] + print kwargs + print 'N_POINTS', N_POINTS + + clf = BinaryASGD(*DEFAULT_ARGS, rstate=rstate, **kwargs) + + clf.fit(X, y) + print clf.fit_n_partial + print len(clf.train_means) + print clf.n_observations + assert clf.fit_n_partial == kwargs['fit_n_partial'] + assert len(clf.train_means) == clf.n_observations / clf.fit_n_partial + assert clf.min_observations == kwargs['min_observations'] + assert clf.max_observations == sys.maxint + assert clf.n_observations >= kwargs['min_observations'] + assert clf.fit_converged() + ytrn_preds = clf.predict(X) + ytrn_acc = (ytrn_preds == y).mean() + assert ytrn_acc > 0.7 From 2e90505b1170d2ce050122480f01f445128222cf Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 31 Jan 2012 18:55:24 -0500 Subject: [PATCH 03/18] better docstring for binary_fit --- asgd/auto_step_size.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index 3045520..3ab1209 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -85,7 +85,8 @@ def binary_fit( max_examples: int Maximum number of examples to use from `X` and `y` to find an - estimate of the best sgd_step_size0 + estimate of the best sgd_step_size0. N.B. That the entirety of X and y + is used for the final fit() call after the best step size has been found. 
Returns ------- From 0db2514f268bb4ddc2392285849cd8dd79c91860 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 31 Jan 2012 18:55:42 -0500 Subject: [PATCH 04/18] NaiveBinaryASGD assert labels in +-1 --- asgd/naive_asgd.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 8a3bec2..36fa1a0 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -103,6 +103,8 @@ class NaiveBinaryASGD(BaseBinaryASGD): def partial_fit(self, X, y): + assert np.all(y**2 == 1) + sgd_step_size0 = self.sgd_step_size0 sgd_step_size = self.sgd_step_size sgd_step_size_scheduling_exponent = \ From 2142727ae708126f72931293a12e69198408a53c Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Fri, 3 Feb 2012 20:42:08 -0500 Subject: [PATCH 05/18] early stopping tolerance in terms of abs and rel tolerance --- asgd/naive_asgd.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 36fa1a0..df227dc 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -15,8 +15,9 @@ DEFAULT_FEEDBACK = False DEFAULT_RSTATE = None DEFAULT_DTYPE = np.float32 -DEFAULT_N_PARTIAL = 1000 -DEFAULT_FIT_TOLERANCE = 1e-2 +DEFAULT_N_PARTIAL = 5000 +DEFAULT_FIT_ABS_TOLERANCE = 1e-4 +DEFAULT_FIT_REL_TOLERANCE = 1e-2 class BaseASGD(object): @@ -27,7 +28,8 @@ def __init__(self, n_features, min_observations=DEFAULT_MIN_OBSERVATIONS, max_observations=DEFAULT_MAX_OBSERVATIONS, fit_n_partial=DEFAULT_N_PARTIAL, - fit_tolerance=DEFAULT_FIT_TOLERANCE, + fit_abs_tolerance=DEFAULT_FIT_ABS_TOLERANCE, + fit_rel_tolerance=DEFAULT_FIT_REL_TOLERANCE, feedback=DEFAULT_FEEDBACK, rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): @@ -43,7 +45,8 @@ def __init__(self, n_features, self.max_observations = max_observations self.fit_n_partial = fit_n_partial - self.fit_tolerance = fit_tolerance + self.fit_abs_tolerance = fit_abs_tolerance + self.fit_rel_tolerance = fit_rel_tolerance if feedback: raise NotImplementedError("FIXME: feedback support is buggy") @@ -73,9 +76,11 @@ def __init__(self, n_features, def fit_converged(self): train_means = self.train_means + rel_tol = self.fit_rel_tolerance + abs_tol = self.fit_abs_tolerance if len(train_means) > 2: midpt = len(train_means) // 2 - thresh = (1 - self.fit_tolerance) * train_means[midpt] + thresh = (1 - rel_tol) * train_means[midpt] - abs_tol return train_means[-1] > thresh return False @@ -221,7 +226,8 @@ def __init__(self, n_classes, n_features, min_observations=DEFAULT_MIN_OBSERVATIONS, max_observations=DEFAULT_MAX_OBSERVATIONS, fit_n_partial=DEFAULT_N_PARTIAL, - fit_tolerance=DEFAULT_FIT_TOLERANCE, + fit_abs_tolerance=DEFAULT_FIT_ABS_TOLERANCE, + fit_rel_tolerance=DEFAULT_FIT_REL_TOLERANCE, feedback=DEFAULT_FEEDBACK, rstate=DEFAULT_RSTATE, dtype=DEFAULT_DTYPE): @@ -233,7 +239,8 @@ def __init__(self, n_classes, n_features, min_observations=min_observations, max_observations=max_observations, fit_n_partial=fit_n_partial, - fit_tolerance=fit_tolerance, + fit_abs_tolerance=fit_abs_tolerance, + fit_rel_tolerance=fit_rel_tolerance, feedback=feedback, rstate=rstate, dtype=dtype, From 4de0d6c8520f1ad4a2d291c499a96a0541267463 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Fri, 3 Feb 2012 20:42:35 -0500 Subject: [PATCH 06/18] improvements to binary_fit --- asgd/auto_step_size.py | 49 +++++++++++++++---------------- asgd/tests/test_auto_step_size.py | 25 ++++++++-------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index 
3ab1209..ce082b1 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -3,13 +3,14 @@ from scipy import optimize DEFAULT_INITIAL_RANGE = 0.25, 0.5 -DEFAULT_MAX_EXAMPLES = 1000 -DEFAULT_TOLERANCE = 0.5 +DEFAULT_MAX_EXAMPLES = 1000 # estimate stepsize from this many examples +DEFAULT_TOLERANCE = 0.01 # in logarithmic units of the training criterion DEFAULT_BRENT_OUTPUT = False + def find_sgd_step_size0( - model, X, y, + model, partial_fit_args, initial_range=DEFAULT_INITIAL_RANGE, tolerance=DEFAULT_TOLERANCE, brent_output=DEFAULT_BRENT_OUTPUT): """Use a Brent line search to find the best step size @@ -19,11 +20,8 @@ def find_sgd_step_size0( model: BinaryASGD Instance of a BinaryASGD - X: array, shape = [n_samples, n_features] - Array of features - - y: array, shape = [n_samples] - Array of labels in (-1, 1) + partial_fit_args - tuple of arguments for model.partial_fit. + This tuple must start with X, y, ... initial_range: tuple of float Initial range for the sgd_step_size0 search (low, high) @@ -48,26 +46,31 @@ def eval_size0(log2_size0): current_step_size = 2 ** log2_size0 other.sgd_step_size0 = current_step_size other.sgd_step_size = current_step_size - other.partial_fit(X, y) + other.partial_fit(*partial_fit_args) # Hack: asgd is lower variance than sgd, but it's tuned to work # well asymptotically, not after just a few examples weights = .5 * (other.asgd_weights + other.sgd_weights) bias = .5 * (other.asgd_bias + other.sgd_bias) + X, y = partial_fit_args[:2] margin = y * (np.dot(X, weights) + bias) l2_cost = other.l2_regularization * (weights ** 2).sum() rval = np.maximum(0, 1 - margin).mean() + l2_cost + if np.isnan(rval): + rval = float('inf') + # -- apply minimizer in log domain + rval = np.log(rval) _cache[log2_size0] = rval return rval - best_sgd_step_size0 = optimize.brent( + log2_best_sgd_step_size0 = optimize.brent( eval_size0, brack=np.log2(initial_range), tol=tolerance) - return best_sgd_step_size0 + return 2 ** log2_best_sgd_step_size0 def binary_fit( - model, X, y, + model, fit_args, max_examples=DEFAULT_MAX_EXAMPLES, **find_sgd_step_size0_kwargs): """Returns a model with automatically-selected sgd_step_size0 @@ -77,11 +80,8 @@ def binary_fit( model: BinaryASGD Instance of the model to be fitted. - X: array, shape = [n_samples, n_features] - Array of features - - y: array, shape = [n_samples] - Array of labels in (-1, 1) + fit_args - tuple of args to model.fit + This method assumes they are all length-of-dataset ndarrays. max_examples: int Maximum number of examples to use from `X` and `y` to find an @@ -95,23 +95,22 @@ def binary_fit( sgd_step_size0 """ - assert X.ndim == 2 - assert len(X) == len(y) assert max_examples > 0 # randomly choose up to max_examples uniformly without replacement from # across the whole set of training data. - idxs = model.rstate.permutation(len(X))[:max_examples] + all_idxs = model.rstate.permutation(len(fit_args[0])) + idxs = all_idxs[:max_examples] # Find the best learning rate for that subset best = find_sgd_step_size0( - model, X[idxs], y[idxs], **find_sgd_step_size0_kwargs) + model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. - best_estimate = 2. 
** (best - 1.0) - model.sgd_step_size0 = best_estimate - model.sgd_step_size = best_estimate - model.fit(X, y) + stepdown = 5 * np.sqrt( float(len(all_idxs)) / float(len(idxs))) + model.sgd_step_size0 = best / stepdown + model.sgd_step_size = best / stepdown + model.fit(*fit_args) return model diff --git a/asgd/tests/test_auto_step_size.py b/asgd/tests/test_auto_step_size.py index 5090837..38e4d1a 100644 --- a/asgd/tests/test_auto_step_size.py +++ b/asgd/tests/test_auto_step_size.py @@ -27,12 +27,13 @@ def test_binary_sgd_step_size0(): X, y = get_fake_data(100, n_features, rstate) clf = get_new_model(n_features, rstate, 100) - best = find_sgd_step_size0(clf, X, y, (.25, .5)) - assert_almost_equal(best, -4.9927, decimal=4) + best0 = find_sgd_step_size0(clf, (X, y), (.25, .5), tolerance=1e-6) # start a little lower, still works - best = find_sgd_step_size0(clf, X, y, (.125, .25)) - assert_almost_equal(best, -4.6180, decimal=4) + best1 = find_sgd_step_size0(clf, (X, y), (.125, .25), tolerance=1e-6) + print best0, best1 + assert_almost_equal(best0, 0.03501, decimal=4) + assert_almost_equal(best1, 0.03501, decimal=4) # find_sgd_step_size0 does not change clf assert clf.sgd_step_size0 == 1000.0 @@ -44,23 +45,23 @@ def test_binary_fit(): clf100 = get_new_model(n_features, rstate, 100) X, y = get_fake_data(100, n_features, rstate) - _clf100 = binary_fit(clf100, X, y) + _clf100 = binary_fit(clf100, (X, y)) assert _clf100 is clf100 - assert_almost_equal(clf100.sgd_step_size0, 0.04812, decimal=4) + assert_almost_equal(clf100.sgd_step_size0, 0.00954, decimal=4) # smoke test clf1000 = get_new_model(n_features, rstate, 1000) X, y = get_fake_data(DEFAULT_MAX_EXAMPLES, n_features, rstate) - _clf1000 = binary_fit(clf1000, X, y) + _clf1000 = binary_fit(clf1000, (X, y)) assert _clf1000 is clf1000 - assert_almost_equal(clf1000.sgd_step_size0, 0.0047, decimal=4) + assert_almost_equal(clf1000.sgd_step_size0, 0.00201, decimal=4) # smoke test that at least it runs clf2000 = get_new_model(n_features, rstate, 2000) X, y = get_fake_data(2000, n_features, rstate) - _clf2000 = binary_fit(clf2000, X, y) + _clf2000 = binary_fit(clf2000, (X, y)) assert _clf2000 == clf2000 - assert_almost_equal(clf2000.sgd_step_size0, 0.0067, decimal=4) + assert_almost_equal(clf2000.sgd_step_size0, 0.001498, decimal=4) def test_fit_replicable(): @@ -70,10 +71,10 @@ def test_fit_replicable(): X, y = get_fake_data(100, n_features, RandomState(4)) m0 = get_new_model(n_features, RandomState(45), 100) - m0 = binary_fit(m0, X, y) + m0 = binary_fit(m0, (X, y)) m1 = get_new_model(n_features, RandomState(45), 100) - m1 = binary_fit(m1, X, y) + m1 = binary_fit(m1, (X, y)) assert_array_equal(m0.sgd_weights, m1.sgd_weights) assert_array_equal(m0.sgd_bias, m1.sgd_bias) From a56823eaa441dd8f4fa6ded60a825397a9dbcb44 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Fri, 24 Feb 2012 00:37:22 -0500 Subject: [PATCH 07/18] adding logger to binary_fit --- asgd/auto_step_size.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index ce082b1..9b24c7b 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -1,7 +1,10 @@ import copy +import logging import numpy as np from scipy import optimize +logger = logging.getLogger(__name__) + DEFAULT_INITIAL_RANGE = 0.25, 0.5 DEFAULT_MAX_EXAMPLES = 1000 # estimate stepsize from this many examples DEFAULT_TOLERANCE = 0.01 # in logarithmic units of the training criterion @@ -105,10 +108,13 @@ def binary_fit( # Find the best learning rate for that subset 
best = find_sgd_step_size0( model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) + logger.info('found best: %f' % best) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. stepdown = 5 * np.sqrt( float(len(all_idxs)) / float(len(idxs))) + + logger.info('setting sgd_step_size: %f' % (best/stepdown)) model.sgd_step_size0 = best / stepdown model.sgd_step_size = best / stepdown model.fit(*fit_args) From dd5d2b98c8954b059c15f828a9d4e545693bfa53 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Fri, 24 Feb 2012 00:37:46 -0500 Subject: [PATCH 08/18] disallow learning rate smaller than 1e-7 --- asgd/auto_step_size.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index 9b24c7b..dd97209 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -69,7 +69,8 @@ def eval_size0(log2_size0): log2_best_sgd_step_size0 = optimize.brent( eval_size0, brack=np.log2(initial_range), tol=tolerance) - return 2 ** log2_best_sgd_step_size0 + rval = max(2 ** log2_best_sgd_step_size0, 1e-7) + return rval def binary_fit( From ddf0560021b9547c63d6a7ee53f5ffb091d80018 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 28 Feb 2012 19:43:58 -0500 Subject: [PATCH 09/18] different line search technique for auto step size --- asgd/auto_step_size.py | 49 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index dd97209..d230c97 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -1,21 +1,18 @@ import copy import logging import numpy as np -from scipy import optimize +import scipy.optimize logger = logging.getLogger(__name__) -DEFAULT_INITIAL_RANGE = 0.25, 0.5 DEFAULT_MAX_EXAMPLES = 1000 # estimate stepsize from this many examples -DEFAULT_TOLERANCE = 0.01 # in logarithmic units of the training criterion -DEFAULT_BRENT_OUTPUT = False - +DEFAULT_TOLERANCE = 1.0 # in log-2 units of the learning rate +DEFAULT_SGD_STEP_SIZE_FLOOR = 1e-7 # -- for huge feature vectors, reduce this. def find_sgd_step_size0( model, partial_fit_args, - initial_range=DEFAULT_INITIAL_RANGE, - tolerance=DEFAULT_TOLERANCE, brent_output=DEFAULT_BRENT_OUTPUT): + tolerance=DEFAULT_TOLERANCE): """Use a Brent line search to find the best step size Parameters @@ -26,15 +23,10 @@ def find_sgd_step_size0( partial_fit_args - tuple of arguments for model.partial_fit. This tuple must start with X, y, ... - initial_range: tuple of float - Initial range for the sgd_step_size0 search (low, high) - - max_iterations: - Maximum number of interations + tolerance: in logarithmic step size units Returns ------- - best_sgd_step_size0: float Optimal sgd_step_size0 given `X` and `y`. """ # -- stupid scipy calls some sizes twice!? 
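
For reference, the calling convention this step-size search now uses (a model plus a tuple of partial_fit arguments) can be exercised as in the following sketch; the toy data, shapes, and values are made up for illustration only:

    import numpy as np
    from asgd import NaiveBinaryASGD
    from asgd.auto_step_size import find_sgd_step_size0, binary_fit

    rstate = np.random.RandomState(0)
    X = rstate.randn(1000, 20).astype('float32')
    y = ((rstate.randn(1000) > 0) * 2 - 1).astype('float32')  # labels in {-1, +1}

    clf = NaiveBinaryASGD(n_features=20, rstate=rstate)

    # the line search takes (X, y, ...) as a tuple; it works on a deep copy
    # and does not modify clf
    best_step0 = find_sgd_step_size0(clf, (X, y))

    # binary_fit runs the same search on a subsample, then fits clf in place
    clf = binary_fit(clf, (X, y))
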
@@ -42,7 +34,7 @@ def find_sgd_step_size0( def eval_size0(log2_size0): try: - return _cache[log2_size0] + return _cache[float(log2_size0)] except KeyError: pass other = copy.deepcopy(model) @@ -58,24 +50,30 @@ def eval_size0(log2_size0): X, y = partial_fit_args[:2] margin = y * (np.dot(X, weights) + bias) l2_cost = other.l2_regularization * (weights ** 2).sum() + #print 'Hinge:', np.maximum(0, 1 - margin).mean() + #print 'L2', l2_cost rval = np.maximum(0, 1 - margin).mean() + l2_cost if np.isnan(rval): rval = float('inf') - # -- apply minimizer in log domain - rval = np.log(rval) - _cache[log2_size0] = rval + #print 'find step %e: %e' % (current_step_size, rval) return rval - log2_best_sgd_step_size0 = optimize.brent( - eval_size0, brack=np.log2(initial_range), tol=tolerance) + log2_best_sgd_step_size0 = scipy.optimize.fmin( + eval_size0, + x0=np.log2(model.sgd_step_size0), + xtol=tolerance, + ) + + #print 'Brent came back with', log2_best_sgd_step_size0, 2 ** log2_best_sgd_step_size0 - rval = max(2 ** log2_best_sgd_step_size0, 1e-7) + rval = 2.0 ** log2_best_sgd_step_size0 return rval def binary_fit( model, fit_args, max_examples=DEFAULT_MAX_EXAMPLES, + step_size_floor=DEFAULT_SGD_STEP_SIZE_FLOOR, **find_sgd_step_size0_kwargs): """Returns a model with automatically-selected sgd_step_size0 @@ -109,15 +107,16 @@ def binary_fit( # Find the best learning rate for that subset best = find_sgd_step_size0( model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) - logger.info('found best: %f' % best) + logger.info('found best: %e' % best) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. - stepdown = 5 * np.sqrt( float(len(all_idxs)) / float(len(idxs))) + stepdown = np.sqrt( float(len(all_idxs)) / float(len(idxs))) + step_size0 = max(best / stepdown, step_size_floor) - logger.info('setting sgd_step_size: %f' % (best/stepdown)) - model.sgd_step_size0 = best / stepdown - model.sgd_step_size = best / stepdown + logger.info('setting sgd_step_size: %e' % step_size0) + model.sgd_step_size0 = float(step_size0) + model.sgd_step_size = float(step_size0) model.fit(*fit_args) return model From 47200288d633036d57bbf418e895c597a0a833c7 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 28 Feb 2012 19:44:39 -0500 Subject: [PATCH 10/18] added perfect-fit convergence condition --- asgd/naive_asgd.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index df227dc..d14a9db 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -75,13 +75,34 @@ def __init__(self, n_features, self.recent_train_costs = [] def fit_converged(self): + """ + There are two convergence tests here. 
Training is considered to be + over if it has + + * stalled: latest train_means is allclose to a previous one (*) + + * terminated: latest train_means is allclose to 0 + + """ train_means = self.train_means - rel_tol = self.fit_rel_tolerance - abs_tol = self.fit_abs_tolerance + rtol = self.fit_rel_tolerance + atol = self.fit_abs_tolerance + assert np.min(train_means) >= 0 + + # -- check for perfect fit + if len(train_means) > 1: + if np.allclose(train_means[-1], 0, atol=atol, rtol=rtol): + return True + + # -- check for stall condition if len(train_means) > 2: - midpt = len(train_means) // 2 - thresh = (1 - rel_tol) * train_means[midpt] - abs_tol - return train_means[-1] > thresh + old_pt = max( + len(train_means) // 2, + len(train_means) - 4) + thresh = (1 - rtol) * train_means[old_pt] - atol + if train_means[-1] > thresh: + return True + return False From 73db426c54d6767591c2c652a1f3f6b18e2f77bc Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 28 Feb 2012 19:58:43 -0500 Subject: [PATCH 11/18] gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4af44bf..a67e6ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.pyc *.so *.egg-info +*.swp From cf54011c10e74ff638c4f7db519b15d838deca46 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 29 Feb 2012 01:11:49 -0500 Subject: [PATCH 12/18] bug in position of new assert --- asgd/naive_asgd.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index d14a9db..4ba5167 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -87,10 +87,10 @@ def fit_converged(self): train_means = self.train_means rtol = self.fit_rel_tolerance atol = self.fit_abs_tolerance - assert np.min(train_means) >= 0 # -- check for perfect fit if len(train_means) > 1: + assert np.min(train_means) >= 0 if np.allclose(train_means[-1], 0, atol=atol, rtol=rtol): return True @@ -149,7 +149,6 @@ def partial_fit(self, X, y): n_observations = self.n_observations train_means = self.train_means recent_train_costs = self.recent_train_costs - min_observations = self.min_observations for obs, label in izip(X, y): From 6c5b8483891052b03286e708c04e7408b2086856 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Mon, 19 Mar 2012 18:45:52 -0400 Subject: [PATCH 13/18] adding NaiveRankASGD --- asgd/__init__.py | 4 +- asgd/auto_step_size.py | 11 +- asgd/naive_asgd.py | 278 +++++++++++++++++++++++++++-------------- 3 files changed, 189 insertions(+), 104 deletions(-) diff --git a/asgd/__init__.py b/asgd/__init__.py index eabbd9e..08d1272 100644 --- a/asgd/__init__.py +++ b/asgd/__init__.py @@ -1,3 +1,5 @@ -from naive_asgd import NaiveBinaryASGD, NaiveOVAASGD +from naive_asgd import NaiveBinaryASGD +from naive_asgd import NaiveOVAASGD +from naive_asgd import NaiveRankASGD from experimental_asgd import ExperimentalBinaryASGD diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index d230c97..dfb8af2 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -44,15 +44,11 @@ def eval_size0(log2_size0): other.partial_fit(*partial_fit_args) # Hack: asgd is lower variance than sgd, but it's tuned to work # well asymptotically, not after just a few examples - weights = .5 * (other.asgd_weights + other.sgd_weights) - bias = .5 * (other.asgd_bias + other.sgd_bias) + other.asgd_weights = .5 * (other.asgd_weights + other.sgd_weights) + other.asgd_bias = .5 * (other.asgd_bias + other.sgd_bias) X, y = partial_fit_args[:2] - margin = y * (np.dot(X, weights) + bias) 
- l2_cost = other.l2_regularization * (weights ** 2).sum() - #print 'Hinge:', np.maximum(0, 1 - margin).mean() - #print 'L2', l2_cost - rval = np.maximum(0, 1 - margin).mean() + l2_cost + rval = other.cost(X, y) if np.isnan(rval): rval = float('inf') #print 'find step %e: %e' % (current_step_size, rval) @@ -70,6 +66,7 @@ def eval_size0(log2_size0): return rval +# XXX: use different name, function is not specific to binary classification def binary_fit( model, fit_args, max_examples=DEFAULT_MAX_EXAMPLES, diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 4ba5167..259252e 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -105,6 +105,42 @@ def fit_converged(self): return False + def fit(self, X, y): + + assert X.ndim == 2 + assert y.ndim == 1 + + n_points, n_features = X.shape + assert n_features == self.n_features + assert n_points == y.size + + n_points_remaining = self.max_observations - self.n_observations + + while n_points_remaining > 0: + + # -- every iteration will train from n_partial observations and + # then check for convergence + fit_n_partial = min(n_points_remaining, self.fit_n_partial) + + idx = self.rstate.permutation(n_points) + Xb = X[idx[:fit_n_partial]] + yb = y[idx[:fit_n_partial]] + self.partial_fit(Xb, yb) + + if self.feedback: + raise NotImplementedError( + 'partial_fit logic requires memory to be distinct') + self.sgd_weights = self.asgd_weights + self.sgd_bias = self.asgd_bias + + if (self.n_observations >= self.min_observations + and self.fit_converged()): + break + + n_points_remaining -= len(Xb) + + return self + class BaseBinaryASGD(BaseASGD): @@ -124,6 +160,14 @@ def decision_function(self, X): def predict(self, X): return np.sign(self.decision_function(X)) + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + margin = y * (np.dot(X, weights) + bias) + l2_cost = self.l2_regularization * (weights ** 2).sum() + rval = np.maximum(0, 1 - margin).mean() + l2_cost + return rval + class NaiveBinaryASGD(BaseBinaryASGD): @@ -201,80 +245,33 @@ def partial_fit(self, X, y): return self - def fit(self, X, y): - - assert X.ndim == 2 - assert y.ndim == 1 - - n_points, n_features = X.shape - assert n_features == self.n_features - assert n_points == y.size - - n_points_remaining = self.max_observations - self.n_observations - - while n_points_remaining > 0: - - # -- every iteration will train from n_partial observations and - # then check for convergence - fit_n_partial = min(n_points_remaining, self.fit_n_partial) - - idx = self.rstate.permutation(n_points) - Xb = X[idx[:fit_n_partial]] - yb = y[idx[:fit_n_partial]] - self.partial_fit(Xb, yb) - - if self.feedback: - raise NotImplementedError( - 'partial_fit logic requires memory to be distinct') - self.sgd_weights = self.asgd_weights - self.sgd_bias = self.asgd_bias - - if (self.n_observations >= self.min_observations - and self.fit_converged()): - break - - n_points_remaining -= len(Xb) - - return self - -class NaiveOVAASGD(BaseASGD): - - def __init__(self, n_classes, n_features, - sgd_step_size0=DEFAULT_SGD_STEP_SIZE0, - l2_regularization=DEFAULT_L2_REGULARIZATION, - min_observations=DEFAULT_MIN_OBSERVATIONS, - max_observations=DEFAULT_MAX_OBSERVATIONS, - fit_n_partial=DEFAULT_N_PARTIAL, - fit_abs_tolerance=DEFAULT_FIT_ABS_TOLERANCE, - fit_rel_tolerance=DEFAULT_FIT_REL_TOLERANCE, - feedback=DEFAULT_FEEDBACK, - rstate=DEFAULT_RSTATE, - dtype=DEFAULT_DTYPE): - - super(NaiveOVAASGD, self).__init__( - n_features, - sgd_step_size0=sgd_step_size0, - l2_regularization=l2_regularization, - 
min_observations=min_observations, - max_observations=max_observations, - fit_n_partial=fit_n_partial, - fit_abs_tolerance=fit_abs_tolerance, - fit_rel_tolerance=fit_rel_tolerance, - feedback=feedback, - rstate=rstate, - dtype=dtype, - ) +class BaseMultiASGD(BaseASGD): + """ + Allocate weight vectors for one-vs-all multiclass SVM + """ + def __init__(self, n_classes, n_features, *args, **kwargs): + BaseASGD.__init__(self, n_features, *args, **kwargs) # -- - assert n_classes > 1 self.n_classes = n_classes + dtype = self.dtype # -- self.sgd_weights = np.zeros((n_features, n_classes), dtype=dtype) - self.sgd_bias = np.zeros((n_classes,), dtype=dtype) + self.sgd_bias = np.zeros(n_classes, dtype=dtype) self.asgd_weights = np.zeros((n_features, n_classes), dtype=dtype) - self.asgd_bias = np.zeros((n_classes), dtype=dtype) + self.asgd_bias = np.zeros(n_classes, dtype=dtype) + + def decision_function(self, X): + X = np.asarray(X) + return dot(X, self.asgd_weights) + self.asgd_bias + + def predict(self, X): + return self.decision_function(X).argmax(axis=1) + + +class NaiveOVAASGD(BaseMultiASGD): def partial_fit(self, X, y): @@ -299,6 +296,9 @@ def partial_fit(self, X, y): n_observations = self.n_observations n_classes = self.n_classes + train_means = self.train_means + recent_train_costs = self.recent_train_costs + for obs, label in izip(X, y): label = 2 * (np.arange(n_classes) == label).astype(int) - 1 @@ -310,13 +310,19 @@ def partial_fit(self, X, y): sgd_weights *= (1 - l2_regularization * sgd_step_size) violations = margin < 1 - label_violated = label[violations] - sgd_weights[:, violations] += ( - sgd_step_size - * label_violated[np.newaxis, :] - * obs[:, np.newaxis] - ) - sgd_bias[violations] += sgd_step_size * label_violated + + if violations: + label_violated = label[violations] + sgd_weights[:, violations] += ( + sgd_step_size + * label_violated[np.newaxis, :] + * obs[:, np.newaxis] + ) + sgd_bias[violations] += sgd_step_size * label_violated + recent_train_costs.append( + n_classes - margin[violations].sum()) + else: + recent_train_costs.append(0.0) # -- update asgd asgd_weights = (1 - asgd_step_size) * asgd_weights \ @@ -333,6 +339,12 @@ def partial_fit(self, X, y): sgd_step_size_scheduling_exponent) asgd_step_size = 1. / n_observations + if len(recent_train_costs) == self.fit_n_partial: + train_means.append(np.mean(recent_train_costs) + + l2_regularization * np.dot( + self.asgd_weights, self.asgd_weights)) + self.recent_train_costs = recent_train_costs = [] + # -- self.sgd_weights = sgd_weights self.sgd_bias = sgd_bias @@ -346,38 +358,112 @@ def partial_fit(self, X, y): return self - def fit(self, X, y): - assert X.ndim == 2 - assert y.ndim == 1 +class NaiveRankASGD(BaseMultiASGD): + """ + Implements rank-based multiclass SVM. 
+ """ - n_points, n_features = X.shape - assert n_features == self.n_features - assert n_points == y.size + def partial_fit(self, X, y): + if set(y) > set(range(self.n_classes)): + raise ValueError("Invalid 'y'") - n_points_remaining = self.max_observations - self.n_observations + sgd_step_size0 = self.sgd_step_size0 + sgd_step_size = self.sgd_step_size + sgd_step_size_scheduling_exponent = \ + self.sgd_step_size_scheduling_exponent + sgd_step_size_scheduling_multiplier = \ + self.sgd_step_size_scheduling_multiplier + sgd_weights = self.sgd_weights + sgd_bias = self.sgd_bias - while n_points_remaining > 0: - # -- every iteration will train from n_partial observations and - # then check for convergence - fit_n_partial = min(n_points_remaining, self.fit_n_partial) + asgd_weights = self.asgd_weights + asgd_bias = self.asgd_bias + asgd_step_size = self.asgd_step_size - idx = self.rstate.permutation(n_points) - Xb = X[idx[:fit_n_partial]] - yb = y[idx[:fit_n_partial]] - self.partial_fit(Xb, yb) + l2_regularization = self.l2_regularization - if self.feedback: - self.sgd_weights = self.asgd_weights - self.sgd_bias = self.asgd_bias + n_observations = self.n_observations + n_classes = self.n_classes - n_points_remaining -= len(Xb) + train_means = self.train_means + recent_train_costs = self.recent_train_costs + + for obs, label in izip(X, y): + + # -- compute margin + decision = dot(obs, sgd_weights) + sgd_bias + + # -- update sgd + if l2_regularization: + sgd_weights *= (1 - l2_regularization * sgd_step_size) + + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] is label else dsrt[-1] + margin = decision[label] - decision[distractor] + + if margin < 1: + winc = sgd_step_size * obs + sgd_weights[:, distractor] -= winc + sgd_weights[:, label] += winc + sgd_bias[distractor] -= sgd_step_size + sgd_bias[label] += sgd_step_size + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0.0) + + # -- update asgd + # -- TODO: delay updates until sgd_weights are modified, + # only 2 rows change per iteration + asgd_weights = (1 - asgd_step_size) * asgd_weights \ + + asgd_step_size * sgd_weights + asgd_bias = (1 - asgd_step_size) * asgd_bias \ + + asgd_step_size * sgd_bias + + # -- update step_sizes + n_observations += 1 + sgd_step_size_scheduling = (1 + sgd_step_size0 * n_observations * + sgd_step_size_scheduling_multiplier) + sgd_step_size = sgd_step_size0 / \ + (sgd_step_size_scheduling ** \ + sgd_step_size_scheduling_exponent) + asgd_step_size = 1. 
/ n_observations + + if len(recent_train_costs) == self.fit_n_partial: + flat_weights = asgd_weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + train_means.append(np.mean(recent_train_costs) + + l2_regularization * l2_term) + self.recent_train_costs = recent_train_costs = [] + + # -- + self.sgd_weights = sgd_weights + self.sgd_bias = sgd_bias + self.sgd_step_size = sgd_step_size + + self.asgd_weights = asgd_weights + self.asgd_bias = asgd_bias + self.asgd_step_size = asgd_step_size + + self.n_observations = n_observations return self - def decision_function(self, X): - return dot(X, self.asgd_weights) + self.asgd_bias - def predict(self, X): - return self.decision_function(X).argmax(1) + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + decisions = dot(X, weights) + bias + margins = [] + for decision, label in zip(decisions, y): + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] is label else dsrt[-1] + margin = decision[label] - decision[distractor] + margins.append(margin) + + flat_weights = weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + l2_cost = self.l2_regularization * l2_term + rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost + return rval From 23a859701dd92a362d9a09b0f9e8aeab251ce238 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 20 Mar 2012 16:48:59 -0400 Subject: [PATCH 14/18] adding SparseUpdateASGD --- asgd/__init__.py | 1 + asgd/auto_step_size.py | 7 +- asgd/naive_asgd.py | 176 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 173 insertions(+), 11 deletions(-) diff --git a/asgd/__init__.py b/asgd/__init__.py index 08d1272..b9a13ea 100644 --- a/asgd/__init__.py +++ b/asgd/__init__.py @@ -1,5 +1,6 @@ from naive_asgd import NaiveBinaryASGD from naive_asgd import NaiveOVAASGD from naive_asgd import NaiveRankASGD +from naive_asgd import SparseUpdateRankASGD from experimental_asgd import ExperimentalBinaryASGD diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index dfb8af2..f7d804e 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -1,5 +1,6 @@ import copy import logging +import time import numpy as np import scipy.optimize @@ -102,9 +103,11 @@ def binary_fit( idxs = all_idxs[:max_examples] # Find the best learning rate for that subset + t0 = time.time() best = find_sgd_step_size0( model, [a[idxs] for a in fit_args], **find_sgd_step_size0_kwargs) - logger.info('found best: %e' % best) + logger.info('found best stepsize %e in %f seconds' % ( + best, time.time() - t0)) # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. 
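
The timing message added in this hunk goes through the module-level logger introduced in PATCH 07 (logging.getLogger(__name__), so it is named 'asgd.auto_step_size'). A caller who wants to see these messages only needs standard-library logging configuration, for example:

    import logging
    logging.basicConfig(level=logging.INFO)
    # or keep the root at WARNING and raise just this module:
    # logging.basicConfig()
    # logging.getLogger('asgd.auto_step_size').setLevel(logging.INFO)
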
@@ -114,6 +117,8 @@ def binary_fit( logger.info('setting sgd_step_size: %e' % step_size0) model.sgd_step_size0 = float(step_size0) model.sgd_step_size = float(step_size0) + t0 = time.time() model.fit(*fit_args) + logger.info('full fit took %f seconds' % (time.time() - t0)) return model diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index 259252e..b73a671 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -365,7 +365,6 @@ class NaiveRankASGD(BaseMultiASGD): """ def partial_fit(self, X, y): - if set(y) > set(range(self.n_classes)): raise ValueError("Invalid 'y'") @@ -395,14 +394,14 @@ def partial_fit(self, X, y): # -- compute margin decision = dot(obs, sgd_weights) + sgd_bias - # -- update sgd - if l2_regularization: - sgd_weights *= (1 - l2_regularization * sgd_step_size) - dsrt = np.argsort(decision) - distractor = dsrt[-2] if dsrt[-1] is label else dsrt[-1] + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] margin = decision[label] - decision[distractor] + # -- update sgd + if l2_regularization: + sgd_weights *= 1 - l2_regularization * sgd_step_size + if margin < 1: winc = sgd_step_size * obs sgd_weights[:, distractor] -= winc @@ -414,8 +413,6 @@ def partial_fit(self, X, y): recent_train_costs.append(0.0) # -- update asgd - # -- TODO: delay updates until sgd_weights are modified, - # only 2 rows change per iteration asgd_weights = (1 - asgd_step_size) * asgd_weights \ + asgd_step_size * sgd_weights asgd_bias = (1 - asgd_step_size) * asgd_bias \ @@ -450,15 +447,173 @@ def partial_fit(self, X, y): return self + def cost(self, X, y): + weights = self.asgd_weights + bias = self.asgd_bias + decisions = dot(np.asarray(X), weights) + bias + margins = [] + for decision, label in zip(decisions, y): + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + margins.append(margin) + + flat_weights = weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + l2_cost = self.l2_regularization * l2_term + rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost + return rval + + +#import line_profiler +#profile = line_profiler.LineProfiler() +#import time + + +class SparseUpdateRankASGD(BaseMultiASGD): + """ + Implements rank-based multiclass SVM. 
+ """ + + #@profile + def partial_fit(self, X, y): + if set(y) > set(range(self.n_classes)): + raise ValueError("Invalid 'y'") + #ttt = time.time() + + sgd_step_size0 = self.sgd_step_size0 + sgd_step_size = self.sgd_step_size + sgd_step_size_scheduling_exponent = \ + self.sgd_step_size_scheduling_exponent + sgd_step_size_scheduling_multiplier = \ + self.sgd_step_size_scheduling_multiplier + sgd_weights = np.asarray(self.sgd_weights, order='F') + sgd_bias = self.sgd_bias + + asgd_weights = np.asarray(self.asgd_weights, order='F') + asgd_bias = self.asgd_bias + asgd_step_size = self.asgd_step_size + + l2_regularization = self.l2_regularization + + n_observations = self.n_observations + n_classes = self.n_classes + + train_means = self.train_means + recent_train_costs = self.recent_train_costs + + # -- the logical sgd_weight matrix is sgd_weights_scale * sgd_weights + sgd_weights_scale = 1.0 + + use_asgd_scale = True + + if use_asgd_scale: + # -- the logical asgd_weights is stored as a row-wise linear + # -- combination of asgd_weights and sgd_weights + asgd_scale = np.ones((n_classes, 2), dtype=asgd_weights.dtype) + asgd_scale[:, 1] = 0 # -- originally there is no sgd contribution + + for obs, label in izip(X, y): + + # -- compute margin + decision = sgd_weights_scale * dot(obs, sgd_weights) + sgd_bias + + dsrt = np.argsort(decision) + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] + margin = decision[label] - decision[distractor] + + # -- update sgd + sgd_weights_scale *= 1 - l2_regularization * sgd_step_size + + if margin < 1: + # -- perform the delayed updates to the rows of asgd + + if use_asgd_scale: + # -- XXX: this should be a single BLAS call to axpy + asgd_weights[:, distractor] *= asgd_scale[distractor, 0] + asgd_weights[:, distractor] += \ + (asgd_scale[distractor, 1] * sgd_weights_scale) \ + * sgd_weights[:, distractor] + asgd_scale[distractor] = [1, 0] + + # -- XXX: this should be a single BLAS call to axpy + asgd_weights[:, label] *= asgd_scale[label, 0] + asgd_weights[:, label] += \ + (asgd_scale[label, 1] * sgd_weights_scale)\ + * sgd_weights[:, label] + asgd_scale[label] = [1, 0] + + winc = (sgd_step_size / sgd_weights_scale) * obs + sgd_weights[:, distractor] -= winc + sgd_weights[:, label] += winc + sgd_bias[distractor] -= sgd_step_size + sgd_bias[label] += sgd_step_size + recent_train_costs.append(1 - float(margin)) + else: + recent_train_costs.append(0.0) + + if use_asgd_scale: + # -- update asgd via scale variables + asgd_scale *= (1 - asgd_step_size) + asgd_scale[:, 1] += asgd_step_size + else: + asgd_weights = (1 - asgd_step_size) * asgd_weights \ + + asgd_step_size * sgd_weights + + asgd_bias = (1 - asgd_step_size) * asgd_bias \ + + asgd_step_size * sgd_bias + + # -- update step_sizes + n_observations += 1 + sgd_step_size_scheduling = (1 + sgd_step_size0 * n_observations * + sgd_step_size_scheduling_multiplier) + sgd_step_size = sgd_step_size0 / \ + (sgd_step_size_scheduling ** \ + sgd_step_size_scheduling_exponent) + asgd_step_size = 1. 
/ n_observations + + if len(recent_train_costs) == self.fit_n_partial: + if use_asgd_scale: + asgd_weights[:] *= asgd_scale[:, 0] + asgd_weights[:] += (sgd_weights_scale * asgd_scale[:, 1]) * sgd_weights + asgd_scale[:, 0] = 1 + asgd_scale[:, 1] = 0 + + flat_weights = asgd_weights.flatten() + l2_term = np.dot(flat_weights, flat_weights) + train_means.append(np.mean(recent_train_costs) + + l2_regularization * l2_term) + self.recent_train_costs = recent_train_costs = [] + + # -- + self.sgd_weights = sgd_weights * sgd_weights_scale + self.sgd_bias = sgd_bias + self.sgd_step_size = sgd_step_size + + if use_asgd_scale: + asgd_weights[:] *= asgd_scale[:, 0] + asgd_weights[:] += (sgd_weights_scale * asgd_scale[:, 1]) * sgd_weights + asgd_scale[:, 0] = 1 + asgd_scale[:, 1] = 0 + + self.asgd_weights = asgd_weights + self.asgd_bias = asgd_bias + self.asgd_step_size = asgd_step_size + + self.n_observations = n_observations + + #profile.print_stats() + #print 'n_obs', n_observations, 'time/%i' % len(X), (time.time() - ttt) + return self def cost(self, X, y): weights = self.asgd_weights bias = self.asgd_bias - decisions = dot(X, weights) + bias + decisions = dot(np.asarray(X), weights) + bias margins = [] for decision, label in zip(decisions, y): dsrt = np.argsort(decision) - distractor = dsrt[-2] if dsrt[-1] is label else dsrt[-1] + distractor = dsrt[-2] if dsrt[-1] == label else dsrt[-1] margin = decision[label] - decision[distractor] margins.append(margin) @@ -467,3 +622,4 @@ def cost(self, X, y): l2_cost = self.l2_regularization * l2_term rval = np.maximum(0, 1 - np.asarray(margins)).mean() + l2_cost return rval + From 0cc4515e709cd13e945eeef1348fa2498bef48ab Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 20 Mar 2012 16:54:07 -0400 Subject: [PATCH 15/18] adding linsvm --- asgd/linsvm.py | 143 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 asgd/linsvm.py diff --git a/asgd/linsvm.py b/asgd/linsvm.py new file mode 100644 index 0000000..9e343b9 --- /dev/null +++ b/asgd/linsvm.py @@ -0,0 +1,143 @@ +import numpy as np + +from .auto_step_size import binary_fit +from .naive_asgd import NaiveBinaryASGD +from .naive_asgd import NaiveRankASGD +from .naive_asgd import SparseUpdateRankASGD + +try: + import sklearn.svm +except ImportError: + pass + + +class LinearSVM(object): + """ + SVM-fitting object that implements a heuristic for choosing + the right solver among several that may be installed in sklearn, and asgd. 
+ + """ + + def __init__(self, l2_regularization, solver='auto', label_dct=None, + label_weights=None): + self.l2_regularization = l2_regularization + self.solver = solver + self.label_dct = label_dct + self.label_weights = label_weights + + def fit(self, X, y, weights=None, bias=None): + solver = self.solver + label_dct = self.label_dct + l2_regularization = self.l2_regularization + + if weights or bias: + raise NotImplementedError( + 'Currently only train_set = (X, y) is supported') + del weights, bias + if self.label_weights: + raise NotImplementedError() + + n_train, n_feats = X.shape + if self.label_dct is None: + label_dct = dict([(v, i) for (i, v) in enumerate(sorted(set(y)))]) + else: + label_dct = self.label_dct + n_classes = len(label_dct) + + if n_classes < 2: + raise ValueError('must be at least 2 labels') + + elif n_classes == 2: + if set(y) != set([-1, 1]): + # TODO: use the label_dct to automatically adjust + raise NotImplementedError() + + if solver == 'auto': + if n_feats > n_train: + solver = ('sklearn.svm.SVC', {'kernel': 'precomputed'}) + else: + solver = ('asgd.NaiveBinaryASGD', {}) + + method, method_kwargs = solver + + if method == 'asgd.NaiveBinaryASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveBinaryASGD( + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'sklearn.svm.SVC': + C = 1.0 / (l2_regularization * len(X)) + svm = sklearn.svm.SVC(C=C, scale_C=False, **method_kwargs) + raise NotImplementedError( + 'save ktrn, multiply Xtst by X in predict()') + ktrn = linear_kernel(X, X) + svm.fit(ktrn, y) + + else: + raise ValueError('unrecognized method', method) + + else: # n_classes > 2 + if set(y) != set(range(len(set(y)))): + # TODO: use the label_dct to automatically adjust + raise NotImplementedError('labels need adapting', + set(y)) + if solver == 'auto': + # TODO: switch to OVA and use libSVM? + #if n_feats > n_train: + #solver = ('asgd.NaiveRankASGD', { }) + solver = ('asgd.SparseUpdateRankASGD', {}) + + method, method_kwargs = solver + + if method == 'asgd.NaiveRankASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveRankASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'asgd.SparseUpdateRankASGD': + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = SparseUpdateRankASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'asgd.NaiveOVAASGD': + # -- one vs. all + method_kwargs = dict(method_kwargs) + method_kwargs.setdefault('rstate', np.random.RandomState(123)) + svm = NaiveOVAASGD(n_classes, n_feats, + l2_regularization=l2_regularization, + **method_kwargs) + svm = binary_fit(svm, (X, y)) + + elif method == 'sklearn.svm.SVC': + # -- one vs. one + raise NotImplementedError(method) + + elif method == 'sklearn.svm.NuSVC': + # -- one vs. one + raise NotImplementedError(method) + + elif method == 'sklearn.svm.LinearSVC': + # -- one vs. 
all + raise NotImplementedError(method) + + else: + raise ValueError('unrecognized method', method) + + self.svm = svm + + def predict(self, *args, **kwargs): + return self.svm.predict(*args, **kwargs) + + def decision_function(self, *args, **kwargs): + return self.svm.decision_function(*args, **kwargs) + + From cd9da187705890ab51062cde0b8dc1501dacf2c2 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 20 Mar 2012 16:58:00 -0400 Subject: [PATCH 16/18] comment for linsvm --- asgd/linsvm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/asgd/linsvm.py b/asgd/linsvm.py index 9e343b9..8efde54 100644 --- a/asgd/linsvm.py +++ b/asgd/linsvm.py @@ -1,3 +1,8 @@ +""" +Automatic heuristic solver selection: LinearSVM + +""" + import numpy as np from .auto_step_size import binary_fit From 104bccf4b4e54639b8dbc819bf80bef3b3d05104 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 20 Mar 2012 17:15:23 -0400 Subject: [PATCH 17/18] adding LinearSVM to toplevel --- asgd/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asgd/__init__.py b/asgd/__init__.py index b9a13ea..4501d95 100644 --- a/asgd/__init__.py +++ b/asgd/__init__.py @@ -4,3 +4,5 @@ from naive_asgd import SparseUpdateRankASGD from experimental_asgd import ExperimentalBinaryASGD + +from linsvm import LinearSVM From c1e0a3aa16c9391616ba86a1f28e9a20574d70b9 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Thu, 22 Mar 2012 09:27:05 -0400 Subject: [PATCH 18/18] fixing tests and changing early_stop minimization method --- asgd/auto_step_size.py | 51 ++++++++++++++++++++----------- asgd/linsvm.py | 17 +++++++---- asgd/naive_asgd.py | 15 +++++++-- asgd/tests/test_auto_step_size.py | 41 +++++++++---------------- asgd/tests/test_naive_asgd.py | 4 +-- 5 files changed, 73 insertions(+), 55 deletions(-) diff --git a/asgd/auto_step_size.py b/asgd/auto_step_size.py index f7d804e..15a0dd9 100644 --- a/asgd/auto_step_size.py +++ b/asgd/auto_step_size.py @@ -30,7 +30,7 @@ def find_sgd_step_size0( ------- Optimal sgd_step_size0 given `X` and `y`. """ - # -- stupid scipy calls some sizes twice!? + # -- stupid solver calls some sizes twice!? _cache = {} def eval_size0(log2_size0): @@ -52,18 +52,37 @@ def eval_size0(log2_size0): rval = other.cost(X, y) if np.isnan(rval): rval = float('inf') - #print 'find step %e: %e' % (current_step_size, rval) + logger.info('find step %e: %e' % (current_step_size, rval)) + _cache[float(log2_size0)] = rval return rval - log2_best_sgd_step_size0 = scipy.optimize.fmin( - eval_size0, - x0=np.log2(model.sgd_step_size0), - xtol=tolerance, - ) - - #print 'Brent came back with', log2_best_sgd_step_size0, 2 ** log2_best_sgd_step_size0 - - rval = 2.0 ** log2_best_sgd_step_size0 + if tolerance < 0.5: + raise NotImplementedError( + 'tolerance too small, need adaptive stepsize') + + step = -tolerance + x0 = np.log2(model.sgd_step_size0) + x1 = np.log2(model.sgd_step_size0) + step + y0 = eval_size0(x0) + y1 = eval_size0(x1) + if y1 > y0: + step *= -1 + y0, y1 = y1, y0 + + while y1 < y0: + x0, y0 = x1, y1 + x1 = x0 + step + y1 = eval_size0(x1) + + # I tried using sp.optimize.fmin, but this function is bumpy and we only + # want a really coarse estimate of the optimmum, so fmin and fmin_powell + # end up being relatively inefficient even compared to this really stupid + # search. 
+ # + # TODO: increase the stepsize every time it still goes down, and then + # backtrack when we over-step + + rval = 2.0 ** x0 return rval @@ -77,7 +96,7 @@ def binary_fit( Parameters ---------- - model: BinaryASGD + model: BaseASGD instance Instance of the model to be fitted. fit_args - tuple of args to model.fit @@ -90,12 +109,11 @@ def binary_fit( Returns ------- - model: BinaryASGD - Instances of the model, fitted with an estimate of the best - sgd_step_size0 + model: model, fitted with an estimate of the best sgd_step_size0 """ assert max_examples > 0 + logger.info('binary_fit: design matrix shape %s' % str(fit_args[0].shape)) # randomly choose up to max_examples uniformly without replacement from # across the whole set of training data. @@ -111,8 +129,7 @@ def binary_fit( # Heuristic: take the best stepsize according to the first max_examples, # and go half that fast for the full run. - stepdown = np.sqrt( float(len(all_idxs)) / float(len(idxs))) - step_size0 = max(best / stepdown, step_size_floor) + step_size0 = max(best / 2.0, step_size_floor) logger.info('setting sgd_step_size: %e' % step_size0) model.sgd_step_size0 = float(step_size0) diff --git a/asgd/linsvm.py b/asgd/linsvm.py index 8efde54..f06c15a 100644 --- a/asgd/linsvm.py +++ b/asgd/linsvm.py @@ -90,10 +90,9 @@ def fit(self, X, y, weights=None, bias=None): raise NotImplementedError('labels need adapting', set(y)) if solver == 'auto': - # TODO: switch to OVA and use libSVM? - #if n_feats > n_train: - #solver = ('asgd.NaiveRankASGD', { }) - solver = ('asgd.SparseUpdateRankASGD', {}) + solver = ('asgd.SparseUpdateRankASGD', { + 'sgd_step_size0': 10.0 / X.shape[1], + }) method, method_kwargs = solver @@ -131,8 +130,14 @@ def fit(self, X, y, weights=None, bias=None): raise NotImplementedError(method) elif method == 'sklearn.svm.LinearSVC': - # -- one vs. all - raise NotImplementedError(method) + # -- Crammer & Singer multi-class + C = 1.0 / (l2_regularization * len(X)) + svm = sklearn.svm.LinearSVC( + C=C, + scale_C=False, + multi_class=True, + **method_kwargs) + svm.fit(X, y) else: raise ValueError('unrecognized method', method) diff --git a/asgd/naive_asgd.py b/asgd/naive_asgd.py index b73a671..6fd0156 100644 --- a/asgd/naive_asgd.py +++ b/asgd/naive_asgd.py @@ -165,7 +165,8 @@ def cost(self, X, y): bias = self.asgd_bias margin = y * (np.dot(X, weights) + bias) l2_cost = self.l2_regularization * (weights ** 2).sum() - rval = np.maximum(0, 1 - margin).mean() + l2_cost + hinge_loss = np.maximum(0, 1 - margin) + rval = hinge_loss.mean() + l2_cost return rval @@ -311,7 +312,7 @@ def partial_fit(self, X, y): violations = margin < 1 - if violations: + if np.any(violations): label_violated = label[violations] sgd_weights[:, violations] += ( sgd_step_size @@ -475,6 +476,11 @@ class SparseUpdateRankASGD(BaseMultiASGD): Implements rank-based multiclass SVM. """ + #XXX + #XXX + # TODO: implement l2-SVM here by updating + # sgd rows in proportion to hinge loss + #@profile def partial_fit(self, X, y): if set(y) > set(range(self.n_classes)): @@ -570,7 +576,10 @@ def partial_fit(self, X, y): sgd_step_size = sgd_step_size0 / \ (sgd_step_size_scheduling ** \ sgd_step_size_scheduling_exponent) - asgd_step_size = 1. / n_observations + if n_observations <= self.fit_n_partial: + asgd_step_size = 1. + else: + asgd_step_size = 1. 
/ (n_observations - self.fit_n_partial) if len(recent_train_costs) == self.fit_n_partial: if use_asgd_scale: diff --git a/asgd/tests/test_auto_step_size.py b/asgd/tests/test_auto_step_size.py index 38e4d1a..1f3c672 100644 --- a/asgd/tests/test_auto_step_size.py +++ b/asgd/tests/test_auto_step_size.py @@ -1,4 +1,5 @@ from nose.tools import assert_equal +import numpy as np from numpy.testing import assert_almost_equal from numpy.testing import assert_array_equal @@ -13,7 +14,7 @@ def get_new_model(n_features, rstate, n_points): return BinaryASGD(n_features, rstate=rstate, - sgd_step_size0=1e3, + sgd_step_size0=1e3, # intentionally large, binary_fit will fix l2_regularization=1e-3, max_observations=5 * n_points, min_observations=5 * n_points, @@ -27,13 +28,9 @@ def test_binary_sgd_step_size0(): X, y = get_fake_data(100, n_features, rstate) clf = get_new_model(n_features, rstate, 100) - best0 = find_sgd_step_size0(clf, (X, y), (.25, .5), tolerance=1e-6) - - # start a little lower, still works - best1 = find_sgd_step_size0(clf, (X, y), (.125, .25), tolerance=1e-6) - print best0, best1 - assert_almost_equal(best0, 0.03501, decimal=4) - assert_almost_equal(best1, 0.03501, decimal=4) + best0 = find_sgd_step_size0(clf, (X, y)) + print best0 + assert np.allclose(best0, 0.04, atol=.1, rtol=.5) # find_sgd_step_size0 does not change clf assert clf.sgd_step_size0 == 1000.0 @@ -43,25 +40,15 @@ def test_binary_fit(): rstate = RandomState(42) n_features = 20 - clf100 = get_new_model(n_features, rstate, 100) - X, y = get_fake_data(100, n_features, rstate) - _clf100 = binary_fit(clf100, (X, y)) - assert _clf100 is clf100 - assert_almost_equal(clf100.sgd_step_size0, 0.00954, decimal=4) - - # smoke test - clf1000 = get_new_model(n_features, rstate, 1000) - X, y = get_fake_data(DEFAULT_MAX_EXAMPLES, n_features, rstate) - _clf1000 = binary_fit(clf1000, (X, y)) - assert _clf1000 is clf1000 - assert_almost_equal(clf1000.sgd_step_size0, 0.00201, decimal=4) - - # smoke test that at least it runs - clf2000 = get_new_model(n_features, rstate, 2000) - X, y = get_fake_data(2000, n_features, rstate) - _clf2000 = binary_fit(clf2000, (X, y)) - assert _clf2000 == clf2000 - assert_almost_equal(clf2000.sgd_step_size0, 0.001498, decimal=4) + for L in [100, DEFAULT_MAX_EXAMPLES, int(DEFAULT_MAX_EXAMPLES * 1.5), + int(DEFAULT_MAX_EXAMPLES * 3)]: + + clf = get_new_model(n_features, rstate, L) + X, y = get_fake_data(L, n_features, rstate, separation=0.1) + best = find_sgd_step_size0(clf, (X, y)) + _clf = binary_fit(clf, (X, y)) + assert _clf is clf + assert 0 < clf.sgd_step_size0 <= best def test_fit_replicable(): diff --git a/asgd/tests/test_naive_asgd.py b/asgd/tests/test_naive_asgd.py index 668c073..6826373 100644 --- a/asgd/tests/test_naive_asgd.py +++ b/asgd/tests/test_naive_asgd.py @@ -22,10 +22,10 @@ dtype=np.float32) -def get_fake_data(n_points, n_features, rstate): +def get_fake_data(n_points, n_features, rstate, separation=1e-1): X = rstate.randn(n_points, n_features).astype(np.float32) y = 2 * (rstate.randn(n_points) > 0) - 1 - X[y == 1] += 1e-1 + X[y == 1] += separation return X, y
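
Note on the rank-based multiclass cost (cf. the cost method patched above): the margin for one example is the score of the true class minus the score of the strongest competing class, and the hinge penalizes margins below 1. A small numeric illustration; the scores here are invented for the example:

    import numpy as np

    decision = np.array([0.2, 1.5, 0.9])   # per-class scores for one example
    label = 1
    order = np.argsort(decision)           # [0, 2, 1]
    # strongest class other than the true label
    distractor = order[-2] if order[-1] == label else order[-1]   # -> class 2
    margin = decision[label] - decision[distractor]               # 1.5 - 0.9 = 0.6
    hinge = max(0.0, 1.0 - margin)                                # 0.4
    print(distractor, margin, hinge)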
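
Note on the LinearSVM wrapper added in PATCH 15-17: it is a thin front-end that picks a solver ('auto' chooses an asgd model for binary problems with more examples than features) and then delegates predict and decision_function to the fitted model. A minimal usage sketch of the intended API, assuming the package is importable as asgd and that binary labels are coded as {-1, +1} as the wrapper requires; the data is made up for illustration:

    import numpy as np
    from asgd import LinearSVM

    rng = np.random.RandomState(0)
    X = rng.randn(200, 50).astype(np.float32)
    y = 2 * (rng.randn(200) > 0) - 1          # labels in {-1, +1}
    X[y == 1] += 0.1                          # nudge the positive class apart

    clf = LinearSVM(l2_regularization=1e-3, solver='auto')
    clf.fit(X, y)                             # n_train > n_feats -> asgd binary path
    print(clf.predict(X[:5]))
    print(clf.decision_function(X[:5]))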
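
Note on the step-size search in PATCH 18: scipy.optimize.fmin is replaced by a coarse walk in log2(step_size) space that keeps moving in one direction while the cost keeps dropping, flipping direction once if the first move goes uphill. A simplified standalone version of that idea, not line-for-line the patched function, with an invented 1-D test curve:

    import numpy as np

    def coarse_downhill_search(f, x_start, step=0.5):
        """Step in one direction while f keeps decreasing; flip the
        direction once if the first move is uphill. Return the best x."""
        x_best, y_best = x_start, f(x_start)
        x_next = x_best + step
        y_next = f(x_next)
        if y_next > y_best:
            step = -step
            x_next = x_best + step
            y_next = f(x_next)
        while y_next < y_best:
            x_best, y_best = x_next, y_next
            x_next = x_best + step
            y_next = f(x_next)
        return x_best

    # e.g. minimizing a bumpy curve in log2(step_size) space
    log2_best = coarse_downhill_search(
        lambda z: (z - 3.2) ** 2 + 0.1 * np.sin(20 * z), x_start=0.0)
    best_step_size = 2.0 ** log2_best
    print(log2_best, best_step_size)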
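
Note on the asgd_step_size change in PATCH 18: averaging now starts only after the first fit_n_partial observations, so the averaged weights are the running mean of the SGD iterates seen after that burn-in rather than of all iterates. A small check of the recurrence; fit_n_partial and the fake iterates below are stand-ins chosen for the example:

    import numpy as np

    def averaged_with_burn_in(ws, k):
        """Apply the patched averaging rule: step = 1 for the first k
        observations, then 1 / (n_obs - k) afterwards."""
        avg = np.zeros_like(ws[0])
        for n_obs, w in enumerate(ws, start=1):
            step = 1.0 if n_obs <= k else 1.0 / (n_obs - k)
            avg = (1.0 - step) * avg + step * w
        return avg

    ws = [np.array([float(t)]) for t in range(1, 11)]    # fake sgd iterates
    print(averaged_with_burn_in(ws, k=3))                # [ 7.]
    print(np.mean(ws[3:], axis=0))                       # also [ 7.]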