Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 75 additions & 27 deletions csrank/dataset_reader/choicefunctions/choice_data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def make_globular_pareto_choices(
n_objects=10,
seed=42,
cluster_spread=1.0,
cluster_size=10,
**kwargs,
):
def pareto_front(X, signs=None):
Expand All @@ -41,45 +40,94 @@ def pareto_front(X, signs=None):
)
return pareto

def sample_unit_ball(n_inst=10000, n_features=2, rng=None, radius=1.0):
rng = check_random_state(rng)
X = rng.randn(n_inst, n_features)
u = rng.uniform(size=n_inst)[:, None]
X /= np.linalg.norm(X, axis=1, ord=2)[:, None]
X *= radius * u
return X
def sample_from_unit_ball(n_points, dimension, radius, random_state):
"""Sample points from a ball (uniform direction, uniform distance).

def make_randn_pareto_choices(
n_instances=10000, n_features=2, n_objects=10, data_seed=None, center=0.0
The ball has radius `radius` and is centered at the origin.

Parameters
----------
n_points : int
The number of points to sample.
dimension : int
The dimension of the space.
radius : float
The radius of the ball.
random_state : np.random.RandomState
A numpy random state.

Returns
-------
numpy array of shape (n_points, dimension)
A list of points sampled from the ball.
"""
# Sample a random direction for each point
directions = random_state.randn(n_points, dimension)
# Normalize each direction vector to have length 1 (euclidean
# norm).
directions /= np.linalg.norm(directions, axis=1, ord=2)[:, None]

# Sample a length (as a fraction of the radius) uniformly for each
# point.
u = random_state.uniform(size=n_points)[:, None]
lengths = u * radius

return directions * lengths

def sample_pareto_from_isometric_normal(
n_points, dimension, center, random_state
):
"""Generate random objects from a d-dimensional isometric normal distribution.
"""Generate a Pareto problem from random objects.

Objects are drawn from a d-dimensional isometric normal
distribution.

This should be the easiest possible Pareto-problem, since the model can learn
a latent-utility which scores how likely a point is on the front (independent
of the other points)."""
rand = check_random_state(data_seed)
X = rand.randn(n_instances, n_objects, n_features)
Y = np.empty((n_instances, n_objects), dtype=bool)
for i in range(n_instances):
Y[i] = pareto_front(X[i])
of the other points).

Parameters
----------
n_points : int
The number of points to sample.
dimension : int
The dimension of the space.
center : scalar or numpy array
An offset that will be added to every point.
random_state : np.random.RandomState
A numpy random state.

Returns
-------
X : numpy array of shape (n_points, dimension)
A list of points sampled from the d-dimensional isometric
normal distribution.
Y : numpy array of shape (n_points,)
A binary flag array indicating whether or not the corresponding
point is part of the Pareto front.
"""
X = random_state.randn(n_points, dimension)
Y = pareto_front(X)
return X + center, Y

rand = check_random_state(seed)
X = np.empty((n_instances, n_objects, n_features))
Y = np.empty((n_instances, n_objects), dtype=int)
for i in range(int(n_instances / cluster_size)):
center = sample_unit_ball(
n_inst=1, n_features=n_features, rng=rand, radius=cluster_spread
for i in range(n_instances):
center = sample_from_unit_ball(
n_points=1,
dimension=n_features,
radius=cluster_spread,
random_state=rand,
)
x, y = make_randn_pareto_choices(
n_instances=cluster_size,
n_features=n_features,
n_objects=n_objects,
data_seed=rand,
x, y = sample_pareto_from_isometric_normal(
n_points=n_objects,
dimension=n_features,
center=center,
random_state=rand,
)
X[i * cluster_size : (i + 1) * cluster_size] = x
Y[i * cluster_size : (i + 1) * cluster_size] = y
X[i] = x
Y[i] = y
return X, Y

def make_latent_linear_choices(
Expand Down
25 changes: 25 additions & 0 deletions csrank/tests/test_pareto_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np

from csrank import ChoiceDatasetGenerator


def test_pareto_problem_generation():
    """Smoke-test generation of the "pareto" choice dataset.

    Builds a small generator, asks it for one train/test split and checks
    the shapes of the returned arrays as well as that the label arrays
    only contain the values 0 and 1.
    """

    def only_zeros_and_ones(values):
        # An entry is acceptable if it compares equal to 0 or to 1.
        is_zero = values == 0
        is_one = values == 1
        return np.logical_or(is_zero, is_one).all()

    generator_options = {
        "dataset_type": "pareto",
        "random_state": 42,
        "n_train_instances": 11,
        "n_test_instances": 1,
        "n_objects": 3,
        "n_features": 2,
    }
    generator = ChoiceDatasetGenerator(**generator_options)
    split = generator.get_single_train_test_split()
    X_train, Y_train, X_test, Y_test = split

    # Shapes follow (n_instances, n_objects[, n_features]).
    assert X_train.shape == (11, 3, 2)
    assert Y_train.shape == (11, 3)
    assert X_test.shape == (1, 3, 2)

    assert only_zeros_and_ones(Y_train)
    assert only_zeros_and_ones(Y_test)