From 7879da4854a145adc57af88e74fe537ddaae47d6 Mon Sep 17 00:00:00 2001 From: Timo Kaufmann Date: Sat, 1 Aug 2020 17:56:00 +0200 Subject: [PATCH 1/2] Replace threshold_instances with an explicit flag The threshold was not configurable and the behavior somewhat unexpected. It's better to let the library user decide themselves. --- csrank/core/pairwise_svm.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/csrank/core/pairwise_svm.py b/csrank/core/pairwise_svm.py index 1974b9d4..09a8d3dd 100644 --- a/csrank/core/pairwise_svm.py +++ b/csrank/core/pairwise_svm.py @@ -16,6 +16,7 @@ def __init__( tol=1e-4, normalize=True, fit_intercept=True, + use_logistic_regression=False, random_state=None, **kwargs, ): @@ -31,6 +32,10 @@ def __init__( If True, the data will be normalized before fitting. fit_intercept : bool, optional If True, the linear model will also fit an intercept. + use_logistic_regression : bool, optional + Whether to fit a Linear Support Vector machine or a Logistic + Regression model. You may want to prefer the simpler Logistic + Regression model on a large sample size. 
random_state : int, RandomState instance or None, optional Seed of the pseudorandom generator or a RandomState instance **kwargs @@ -44,8 +49,8 @@ def __init__( self.C = C self.tol = tol self.logger = logging.getLogger("RankSVM") + self.use_logistic_regression = use_logistic_regression self.random_state = random_state - self.threshold_instances = int(1e10) self.fit_intercept = fit_intercept self.weights = None self.model = None @@ -68,7 +73,7 @@ def fit(self, X, Y, **kwargs): self.random_state_ = check_random_state(self.random_state) _n_instances, self.n_objects_fit_, self.n_object_features_fit_ = X.shape x_train, y_single = self._convert_instances_(X, Y) - if x_train.shape[0] > self.threshold_instances: + if self.use_logistic_regression: self.model = LogisticRegression( C=self.C, tol=self.tol, From 7fcde4b060e210baea129d9d8d39519187b2b79b Mon Sep 17 00:00:00 2001 From: Timo Kaufmann Date: Sat, 1 Aug 2020 18:17:39 +0200 Subject: [PATCH 2/2] Remove instance threshold If the library user wants to reduce the dataset, they can just do so themselves. The current behavior is unexpected, even more so since the threshold is not configurable. 
--- csrank/choicefunction/cmpnet_choice.py | 7 ------- csrank/choicefunction/ranknet_choice.py | 8 -------- csrank/core/cmpnet_core.py | 1 - csrank/core/ranknet_core.py | 1 - csrank/discretechoice/cmpnet_discrete_choice.py | 7 ------- csrank/discretechoice/ranknet_discrete_choice.py | 7 ------- csrank/objectranking/cmp_net.py | 7 ------- csrank/objectranking/list_net.py | 1 - csrank/objectranking/rank_net.py | 7 ------- 9 files changed, 46 deletions(-) diff --git a/csrank/choicefunction/cmpnet_choice.py b/csrank/choicefunction/cmpnet_choice.py index 9154b6cc..3b9cd590 100644 --- a/csrank/choicefunction/cmpnet_choice.py +++ b/csrank/choicefunction/cmpnet_choice.py @@ -100,13 +100,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") x1, x2, garbage, y_double, garbage = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_double = y_double[indices, :] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_double diff --git a/csrank/choicefunction/ranknet_choice.py b/csrank/choicefunction/ranknet_choice.py index 036f0dd0..1b49efa2 100644 --- a/csrank/choicefunction/ranknet_choice.py +++ b/csrank/choicefunction/ranknet_choice.py @@ -97,14 +97,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") x1, x2, garbage, garbage, y_single = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_single = y_single[indices] - self.logger.debug("Sampling instances") self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_single diff --git a/csrank/core/cmpnet_core.py 
b/csrank/core/cmpnet_core.py index 4b80be32..2bcc410f 100644 --- a/csrank/core/cmpnet_core.py +++ b/csrank/core/cmpnet_core.py @@ -54,7 +54,6 @@ def __init__( if key not in allowed_dense_kwargs: del kwargs[key] self.kwargs = kwargs - self.threshold_instances = int(1e10) self.random_state = random_state self.model = None diff --git a/csrank/core/ranknet_core.py b/csrank/core/ranknet_core.py index 093f3ac2..f7464091 100644 --- a/csrank/core/ranknet_core.py +++ b/csrank/core/ranknet_core.py @@ -47,7 +47,6 @@ def __init__( if key not in allowed_dense_kwargs: del kwargs[key] self.kwargs = kwargs - self.threshold_instances = int(1e10) self.batch_size = batch_size self._scoring_model = None self.model = None diff --git a/csrank/discretechoice/cmpnet_discrete_choice.py b/csrank/discretechoice/cmpnet_discrete_choice.py index a96e1496..6d22783a 100644 --- a/csrank/discretechoice/cmpnet_discrete_choice.py +++ b/csrank/discretechoice/cmpnet_discrete_choice.py @@ -97,13 +97,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") x1, x2, garbage, y_double, garbage = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_double = y_double[indices, :] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_double diff --git a/csrank/discretechoice/ranknet_discrete_choice.py b/csrank/discretechoice/ranknet_discrete_choice.py index 797ee0ab..68a782d0 100644 --- a/csrank/discretechoice/ranknet_discrete_choice.py +++ b/csrank/discretechoice/ranknet_discrete_choice.py @@ -96,13 +96,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") x1, x2, garbage, garbage, y_single = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - 
x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_single = y_single[indices] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_single diff --git a/csrank/objectranking/cmp_net.py b/csrank/objectranking/cmp_net.py index b3d609f2..87c42c67 100644 --- a/csrank/objectranking/cmp_net.py +++ b/csrank/objectranking/cmp_net.py @@ -104,13 +104,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") garbage, x1, x2, y_double, garbage = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state_.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_double = y_double[indices, :] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_double diff --git a/csrank/objectranking/list_net.py b/csrank/objectranking/list_net.py index 4d2fd7da..932decad 100644 --- a/csrank/objectranking/list_net.py +++ b/csrank/objectranking/list_net.py @@ -105,7 +105,6 @@ def __init__( del kwargs[key] self.kwargs = kwargs - self.threshold_instances = int(1e10) self.batch_size = batch_size self.random_state = random_state self.hash_file = None diff --git a/csrank/objectranking/rank_net.py b/csrank/objectranking/rank_net.py index 39dd1459..a2f8325d 100644 --- a/csrank/objectranking/rank_net.py +++ b/csrank/objectranking/rank_net.py @@ -98,13 +98,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") garbage, x1, x2, garbage, y_single = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_single = y_single[indices] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, 
y_single