From d2f8db7fb7ff9a960d9f714d83c14ae29685eb85 Mon Sep 17 00:00:00 2001 From: Timo Kaufmann Date: Thu, 19 Nov 2020 13:23:09 +0100 Subject: [PATCH 1/4] Refactor and document the unit ball sampling I think default values for internal functions just hinder understanding. Changed the parameter names to be less domain-specific, since we are just talking about a point in the ball for the purposes of this function. Since this is an internal function, we can require an already initialized random state. Result of this discussion / explanation: https://github.com/kiudee/cs-ranking/pull/164#discussion_r526701277 --- .../choicefunctions/choice_data_generator.py | 47 +++++++++++++++---- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/csrank/dataset_reader/choicefunctions/choice_data_generator.py b/csrank/dataset_reader/choicefunctions/choice_data_generator.py index 57341fcf..ec90c3cd 100644 --- a/csrank/dataset_reader/choicefunctions/choice_data_generator.py +++ b/csrank/dataset_reader/choicefunctions/choice_data_generator.py @@ -41,13 +41,39 @@ def pareto_front(X, signs=None): ) return pareto - def sample_unit_ball(n_inst=10000, n_features=2, rng=None, radius=1.0): - rng = check_random_state(rng) - X = rng.randn(n_inst, n_features) - u = rng.uniform(size=n_inst)[:, None] - X /= np.linalg.norm(X, axis=1, ord=2)[:, None] - X *= radius * u - return X + def sample_from_unit_ball(n_points, dimension, radius, random_state): + """Sample points from a ball (uniform direction and distance). + + The ball has radius `radius` and is centered at the origin. + + Parameters + ---------- + n_points : int + The number of points to sample. + dimension : int + The dimension of the space. + radius : float + The radius of the ball. + random_state: np.random.RandomState + A numpy random state. + + Returns + ------- + numpy array of shape (n_points, dimension) + A list of points sampled from the ball. 
+ """ + # Sample a random direction for each point + directions = random_state.randn(n_points, dimension) + # Normalize each direction vector to have length 1 (euclidean + # norm). + directions /= np.linalg.norm(directions, axis=1, ord=2)[:, None] + + # Sample a length (as a fraction of the radius) uniformly for each + # point. + u = random_state.uniform(size=n_points)[:, None] + lengths = u * radius + + return directions * lengths def make_randn_pareto_choices( n_instances=10000, n_features=2, n_objects=10, data_seed=None, center=0.0 @@ -68,8 +94,11 @@ def make_randn_pareto_choices( X = np.empty((n_instances, n_objects, n_features)) Y = np.empty((n_instances, n_objects), dtype=int) for i in range(int(n_instances / cluster_size)): - center = sample_unit_ball( - n_inst=1, n_features=n_features, rng=rand, radius=cluster_spread + center = sample_from_unit_ball( + n_points=1, + dimension=n_features, + radius=cluster_spread, + random_state=rand, ) x, y = make_randn_pareto_choices( n_instances=cluster_size, From 1509511f575fdf509cfa187f8f9f879718e0b5e5 Mon Sep 17 00:00:00 2001 From: Timo Kaufmann Date: Thu, 19 Nov 2020 13:31:42 +0100 Subject: [PATCH 2/4] Generate a unique centroid per pareto instance Thereby fixing a bug when the number of instances is not a multiple of 10. 
Result of this discussion https://github.com/kiudee/cs-ranking/pull/164#discussion_r526232869 --- .../choicefunctions/choice_data_generator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/csrank/dataset_reader/choicefunctions/choice_data_generator.py b/csrank/dataset_reader/choicefunctions/choice_data_generator.py index ec90c3cd..bab194b5 100644 --- a/csrank/dataset_reader/choicefunctions/choice_data_generator.py +++ b/csrank/dataset_reader/choicefunctions/choice_data_generator.py @@ -27,7 +27,6 @@ def make_globular_pareto_choices( n_objects=10, seed=42, cluster_spread=1.0, - cluster_size=10, **kwargs, ): def pareto_front(X, signs=None): @@ -93,7 +92,7 @@ def make_randn_pareto_choices( rand = check_random_state(seed) X = np.empty((n_instances, n_objects, n_features)) Y = np.empty((n_instances, n_objects), dtype=int) - for i in range(int(n_instances / cluster_size)): + for i in range(n_instances): center = sample_from_unit_ball( n_points=1, dimension=n_features, @@ -101,14 +100,14 @@ def make_randn_pareto_choices( random_state=rand, ) x, y = make_randn_pareto_choices( - n_instances=cluster_size, + n_instances=1, n_features=n_features, n_objects=n_objects, data_seed=rand, center=center, ) - X[i * cluster_size : (i + 1) * cluster_size] = x - Y[i * cluster_size : (i + 1) * cluster_size] = y + X[i] = x + Y[i] = y return X, Y def make_latent_linear_choices( From ee0ccd93f0bcfc55672b7a4a4acfee53ef0efa59 Mon Sep 17 00:00:00 2001 From: Timo Kaufmann Date: Thu, 19 Nov 2020 13:43:15 +0100 Subject: [PATCH 3/4] Clean up and document Pareto sampling Default values for internal functions just add confusion. The parameter names had conflicting meaning, so I switched them to less domain-specific ones. We can assume random states are initialized in internal functions. We only ever need to generate a single instance in this function, so there is no need for multi-instance support. 
--- .../choicefunctions/choice_data_generator.py | 48 +++++++++++++------ 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/csrank/dataset_reader/choicefunctions/choice_data_generator.py b/csrank/dataset_reader/choicefunctions/choice_data_generator.py index bab194b5..3a8ed071 100644 --- a/csrank/dataset_reader/choicefunctions/choice_data_generator.py +++ b/csrank/dataset_reader/choicefunctions/choice_data_generator.py @@ -74,19 +74,40 @@ def sample_from_unit_ball(n_points, dimension, radius, random_state): return directions * lengths - def make_randn_pareto_choices( - n_instances=10000, n_features=2, n_objects=10, data_seed=None, center=0.0 + def sample_pareto_from_isometric_normal( + n_points, dimension, center, random_state ): - """Generate random objects from a d-dimensional isometric normal distribution. + """Generate a Pareto problem from random objects. + + Objects are drawn from a d-dimensional isometric normal + distribution. This should be the easiest possible Pareto-problem, since the model can learn a latent-utility which scores how likely a point is on the front (independent - of the other points).""" - rand = check_random_state(data_seed) - X = rand.randn(n_instances, n_objects, n_features) - Y = np.empty((n_instances, n_objects), dtype=bool) - for i in range(n_instances): - Y[i] = pareto_front(X[i]) + of the other points). + + Parameters + ---------- + n_points : int + The number of points to sample. + dimension : int + The dimension of the space. + center : scalar or numpy array + An offset that will be added to every point. + random_state: np.random.RandomState + A numpy random state. + + Returns + ------- + X: numpy array of shape (n_points, dimension) + A list of points sampled from the d-dimensional isometric + normal distribution. + Y: numpy array of shape n_points + A binary flag array indicating whether or not the corresponding + point is part of the Pareto front. 
+ """ + X = random_state.randn(n_points, dimension) + Y = pareto_front(X) return X + center, Y rand = check_random_state(seed) @@ -99,12 +120,11 @@ def make_randn_pareto_choices( radius=cluster_spread, random_state=rand, ) - x, y = make_randn_pareto_choices( - n_instances=1, - n_features=n_features, - n_objects=n_objects, - data_seed=rand, + x, y = sample_pareto_from_isometric_normal( + n_points=n_objects, + dimension=n_features, center=center, + random_state=rand, ) X[i] = x Y[i] = y From 02d6578730454889e50ad59dd5d38dae36bf609f Mon Sep 17 00:00:00 2001 From: Timo Kaufmann Date: Thu, 19 Nov 2020 13:53:46 +0100 Subject: [PATCH 4/4] Add test for Pareto dataset generation Just a small sanity check to at least exercise the functionality and verify it does something sensible. --- csrank/tests/test_pareto_dataset.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 csrank/tests/test_pareto_dataset.py diff --git a/csrank/tests/test_pareto_dataset.py b/csrank/tests/test_pareto_dataset.py new file mode 100644 index 00000000..34ee90c5 --- /dev/null +++ b/csrank/tests/test_pareto_dataset.py @@ -0,0 +1,25 @@ +import numpy as np + +from csrank import ChoiceDatasetGenerator + + +def test_pareto_problem_generation(): + """A simple sanity check for Pareto problem generation.""" + gen = ChoiceDatasetGenerator( + dataset_type="pareto", + random_state=42, + n_train_instances=11, + n_test_instances=1, + n_objects=3, + n_features=2, + ) + X_train, Y_train, X_test, Y_test = gen.get_single_train_test_split() + assert X_train.shape == (11, 3, 2) + assert Y_train.shape == (11, 3) + assert X_test.shape == (1, 3, 2) + + def is_binary_array(a): + return np.logical_or(a == 0, a == 1).all() + + assert is_binary_array(Y_train) + assert is_binary_array(Y_test)