From d2f8db7fb7ff9a960d9f714d83c14ae29685eb85 Mon Sep 17 00:00:00 2001
From: Timo Kaufmann <timokau@zoho.com>
Date: Thu, 19 Nov 2020 13:23:09 +0100
Subject: [PATCH 1/4] Refactor and document the unit ball sampling

I think default values for internal functions just hinder understanding.
Changed the parameter names to be less domain-specific, since for the
purposes of this function we are just talking about points in a ball.
Since this is an internal function, we can require an already
initialized random state.

Result of this discussion / explanation:
https://github.com/kiudee/cs-ranking/pull/164#discussion_r526701277
---
 .../choicefunctions/choice_data_generator.py  | 47 +++++++++++++++----
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/csrank/dataset_reader/choicefunctions/choice_data_generator.py b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
index 57341fcf..ec90c3cd 100644
--- a/csrank/dataset_reader/choicefunctions/choice_data_generator.py
+++ b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
@@ -41,13 +41,39 @@ def pareto_front(X, signs=None):
                 )
             return pareto
 
-        def sample_unit_ball(n_inst=10000, n_features=2, rng=None, radius=1.0):
-            rng = check_random_state(rng)
-            X = rng.randn(n_inst, n_features)
-            u = rng.uniform(size=n_inst)[:, None]
-            X /= np.linalg.norm(X, axis=1, ord=2)[:, None]
-            X *= radius * u
-            return X
+        def sample_from_unit_ball(n_points, dimension, radius, random_state):
+            """Sample random points from a ball centered at the origin.
+
+            Direction and distance (up to `radius`) are each drawn uniformly.
+
+            Parameters
+            ----------
+            n_points : int
+                The number of points to sample.
+            dimension : int
+                The dimension of the space.
+            radius : float
+                The radius of the ball.
+            random_state: np.random.RandomState
+                A numpy random state.
+
+            Returns
+            -------
+            numpy array of shape (n_points, dimension)
+                A list of points sampled from the ball.
+            """
+            # Sample a random direction for each point
+            directions = random_state.randn(n_points, dimension)
+            # Normalize each direction vector to have length 1 (Euclidean
+            # norm).
+            directions /= np.linalg.norm(directions, axis=1, ord=2)[:, None]
+
+            # Sample a length (as a fraction of the radius) uniformly for each
+            # point.
+            u = random_state.uniform(size=n_points)[:, None]
+            lengths = u * radius
+
+            return directions * lengths
 
         def make_randn_pareto_choices(
             n_instances=10000, n_features=2, n_objects=10, data_seed=None, center=0.0
@@ -68,8 +94,11 @@ def make_randn_pareto_choices(
         X = np.empty((n_instances, n_objects, n_features))
         Y = np.empty((n_instances, n_objects), dtype=int)
         for i in range(int(n_instances / cluster_size)):
-            center = sample_unit_ball(
-                n_inst=1, n_features=n_features, rng=rand, radius=cluster_spread
+            center = sample_from_unit_ball(
+                n_points=1,
+                dimension=n_features,
+                radius=cluster_spread,
+                random_state=rand,
             )
             x, y = make_randn_pareto_choices(
                 n_instances=cluster_size,

From 1509511f575fdf509cfa187f8f9f879718e0b5e5 Mon Sep 17 00:00:00 2001
From: Timo Kaufmann <timokau@zoho.com>
Date: Thu, 19 Nov 2020 13:31:42 +0100
Subject: [PATCH 2/4] Generate a unique centroid per pareto instance

This fixes a bug where the trailing instances were left uninitialized
whenever the number of instances was not a multiple of the cluster size
(10 by default).

Result of this discussion:
https://github.com/kiudee/cs-ranking/pull/164#discussion_r526232869
---
 .../choicefunctions/choice_data_generator.py             | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/csrank/dataset_reader/choicefunctions/choice_data_generator.py b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
index ec90c3cd..bab194b5 100644
--- a/csrank/dataset_reader/choicefunctions/choice_data_generator.py
+++ b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
@@ -27,7 +27,6 @@ def make_globular_pareto_choices(
         n_objects=10,
         seed=42,
         cluster_spread=1.0,
-        cluster_size=10,
         **kwargs,
     ):
         def pareto_front(X, signs=None):
@@ -93,7 +92,7 @@ def make_randn_pareto_choices(
         rand = check_random_state(seed)
         X = np.empty((n_instances, n_objects, n_features))
         Y = np.empty((n_instances, n_objects), dtype=int)
-        for i in range(int(n_instances / cluster_size)):
+        for i in range(n_instances):
             center = sample_from_unit_ball(
                 n_points=1,
                 dimension=n_features,
@@ -101,14 +100,14 @@ def make_randn_pareto_choices(
                 random_state=rand,
             )
             x, y = make_randn_pareto_choices(
-                n_instances=cluster_size,
+                n_instances=1,
                 n_features=n_features,
                 n_objects=n_objects,
                 data_seed=rand,
                 center=center,
             )
-            X[i * cluster_size : (i + 1) * cluster_size] = x
-            Y[i * cluster_size : (i + 1) * cluster_size] = y
+            X[i] = x
+            Y[i] = y
         return X, Y
 
     def make_latent_linear_choices(

From ee0ccd93f0bcfc55672b7a4a4acfee53ef0efa59 Mon Sep 17 00:00:00 2001
From: Timo Kaufmann <timokau@zoho.com>
Date: Thu, 19 Nov 2020 13:43:15 +0100
Subject: [PATCH 3/4] Clean up and document Pareto sampling

Default values for internal functions just add confusion. The parameter
names had conflicting meanings, so I switched them to less
domain-specific ones. We can assume random states are initialized in
internal functions. We only ever need to generate a single instance in
this function, so there is no need for multi-instance support.
---
 .../choicefunctions/choice_data_generator.py  | 48 +++++++++++++------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/csrank/dataset_reader/choicefunctions/choice_data_generator.py b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
index bab194b5..3a8ed071 100644
--- a/csrank/dataset_reader/choicefunctions/choice_data_generator.py
+++ b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
@@ -74,19 +74,40 @@ def sample_from_unit_ball(n_points, dimension, radius, random_state):
 
             return directions * lengths
 
-        def make_randn_pareto_choices(
-            n_instances=10000, n_features=2, n_objects=10, data_seed=None, center=0.0
+        def sample_pareto_from_isometric_normal(
+            n_points, dimension, center, random_state
         ):
-            """Generate random objects from a d-dimensional isometric normal distribution.
+            """Generate a Pareto problem from random objects.
+
+            Objects are drawn from a d-dimensional isometric normal
+            distribution.
 
             This should be the easiest possible Pareto-problem, since the model can learn
             a latent-utility which scores how likely a point is on the front (independent
-            of the other points)."""
-            rand = check_random_state(data_seed)
-            X = rand.randn(n_instances, n_objects, n_features)
-            Y = np.empty((n_instances, n_objects), dtype=bool)
-            for i in range(n_instances):
-                Y[i] = pareto_front(X[i])
+            of the other points).
+
+            Parameters
+            ----------
+            n_points : int
+                The number of points to sample.
+            dimension : int
+                The dimension of the space.
+            center : scalar or numpy array
+                An offset that will be added to every point.
+            random_state: np.random.RandomState
+                A numpy random state.
+
+            Returns
+            -------
+            X: numpy array of shape (n_points, dimension)
+                A list of points sampled from the d-dimensional isometric
+                normal distribution.
+            Y: numpy array of shape (n_points,)
+                A binary flag array indicating whether or not the corresponding
+                point is part of the Pareto front.
+            """
+            X = random_state.randn(n_points, dimension)
+            Y = pareto_front(X)
             return X + center, Y
 
         rand = check_random_state(seed)
@@ -99,12 +120,11 @@ def make_randn_pareto_choices(
                 radius=cluster_spread,
                 random_state=rand,
             )
-            x, y = make_randn_pareto_choices(
-                n_instances=1,
-                n_features=n_features,
-                n_objects=n_objects,
-                data_seed=rand,
+            x, y = sample_pareto_from_isometric_normal(
+                n_points=n_objects,
+                dimension=n_features,
                 center=center,
+                random_state=rand,
             )
             X[i] = x
             Y[i] = y

From 02d6578730454889e50ad59dd5d38dae36bf609f Mon Sep 17 00:00:00 2001
From: Timo Kaufmann <timokau@zoho.com>
Date: Thu, 19 Nov 2020 13:53:46 +0100
Subject: [PATCH 4/4] Add test for Pareto dataset generation

Just a small sanity check to at least exercise the functionality and
verify it does something sensible.
---
 csrank/tests/test_pareto_dataset.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 csrank/tests/test_pareto_dataset.py

diff --git a/csrank/tests/test_pareto_dataset.py b/csrank/tests/test_pareto_dataset.py
new file mode 100644
index 00000000..34ee90c5
--- /dev/null
+++ b/csrank/tests/test_pareto_dataset.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+from csrank import ChoiceDatasetGenerator
+
+
+def test_pareto_problem_generation():
+    """A simple sanity check for Pareto problem generation."""
+    gen = ChoiceDatasetGenerator(
+        dataset_type="pareto",
+        random_state=42,
+        n_train_instances=11,
+        n_test_instances=1,
+        n_objects=3,
+        n_features=2,
+    )
+    X_train, Y_train, X_test, Y_test = gen.get_single_train_test_split()
+    assert X_train.shape == (11, 3, 2)
+    assert Y_train.shape == (11, 3)
+    assert X_test.shape == (1, 3, 2)
+
+    def is_binary_array(a):
+        return np.logical_or(a == 0, a == 1).all()
+
+    assert is_binary_array(Y_train)
+    assert is_binary_array(Y_test)