diff --git a/csrank/choicefunction/cmpnet_choice.py b/csrank/choicefunction/cmpnet_choice.py index 9154b6cc..3b9cd590 100644 --- a/csrank/choicefunction/cmpnet_choice.py +++ b/csrank/choicefunction/cmpnet_choice.py @@ -100,13 +100,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") x1, x2, garbage, y_double, garbage = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_double = y_double[indices, :] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_double diff --git a/csrank/choicefunction/ranknet_choice.py b/csrank/choicefunction/ranknet_choice.py index 036f0dd0..1b49efa2 100644 --- a/csrank/choicefunction/ranknet_choice.py +++ b/csrank/choicefunction/ranknet_choice.py @@ -97,14 +97,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") x1, x2, garbage, garbage, y_single = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_single = y_single[indices] - self.logger.debug("Sampling instances") self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_single diff --git a/csrank/core/cmpnet_core.py b/csrank/core/cmpnet_core.py index 4b80be32..2bcc410f 100644 --- a/csrank/core/cmpnet_core.py +++ b/csrank/core/cmpnet_core.py @@ -54,7 +54,6 @@ def __init__( if key not in allowed_dense_kwargs: del kwargs[key] self.kwargs = kwargs - self.threshold_instances = int(1e10) self.random_state = random_state self.model = None diff --git a/csrank/core/pairwise_svm.py b/csrank/core/pairwise_svm.py index 1974b9d4..09a8d3dd 100644 --- a/csrank/core/pairwise_svm.py +++ 
b/csrank/core/pairwise_svm.py @@ -16,6 +16,7 @@ def __init__( tol=1e-4, normalize=True, fit_intercept=True, + use_logistic_regression=False, random_state=None, **kwargs, ): @@ -31,6 +32,10 @@ def __init__( If True, the data will be normalized before fitting. fit_intercept : bool, optional If True, the linear model will also fit an intercept. + use_logistic_regression : bool, optional + If True, fit a Logistic Regression model instead of a linear + Support Vector Machine. Logistic Regression is the simpler model + and may be preferable for large sample sizes. random_state : int, RandomState instance or None, optional Seed of the pseudorandom generator or a RandomState instance **kwargs @@ -44,8 +49,8 @@ def __init__( self.C = C self.tol = tol self.logger = logging.getLogger("RankSVM") + self.use_logistic_regression = use_logistic_regression self.random_state = random_state - self.threshold_instances = int(1e10) self.fit_intercept = fit_intercept self.weights = None self.model = None @@ -68,7 +73,7 @@ def fit(self, X, Y, **kwargs): self.random_state_ = check_random_state(self.random_state) _n_instances, self.n_objects_fit_, self.n_object_features_fit_ = X.shape x_train, y_single = self._convert_instances_(X, Y) - if x_train.shape[0] > self.threshold_instances: + if self.use_logistic_regression: self.model = LogisticRegression( C=self.C, tol=self.tol, diff --git a/csrank/core/ranknet_core.py b/csrank/core/ranknet_core.py index 093f3ac2..f7464091 100644 --- a/csrank/core/ranknet_core.py +++ b/csrank/core/ranknet_core.py @@ -47,7 +47,6 @@ def __init__( if key not in allowed_dense_kwargs: del kwargs[key] self.kwargs = kwargs - self.threshold_instances = int(1e10) self.batch_size = batch_size self._scoring_model = None self.model = None diff --git a/csrank/discretechoice/cmpnet_discrete_choice.py b/csrank/discretechoice/cmpnet_discrete_choice.py index a96e1496..6d22783a 100644 --- a/csrank/discretechoice/cmpnet_discrete_choice.py +++ 
b/csrank/discretechoice/cmpnet_discrete_choice.py @@ -97,13 +97,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") x1, x2, garbage, y_double, garbage = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_double = y_double[indices, :] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_double diff --git a/csrank/discretechoice/ranknet_discrete_choice.py b/csrank/discretechoice/ranknet_discrete_choice.py index 797ee0ab..68a782d0 100644 --- a/csrank/discretechoice/ranknet_discrete_choice.py +++ b/csrank/discretechoice/ranknet_discrete_choice.py @@ -96,13 +96,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") x1, x2, garbage, garbage, y_single = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_single = y_single[indices] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_single diff --git a/csrank/objectranking/cmp_net.py b/csrank/objectranking/cmp_net.py index b3d609f2..87c42c67 100644 --- a/csrank/objectranking/cmp_net.py +++ b/csrank/objectranking/cmp_net.py @@ -104,13 +104,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") garbage, x1, x2, y_double, garbage = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state_.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_double = y_double[indices, :] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, 
x2, y_double diff --git a/csrank/objectranking/list_net.py b/csrank/objectranking/list_net.py index 4d2fd7da..932decad 100644 --- a/csrank/objectranking/list_net.py +++ b/csrank/objectranking/list_net.py @@ -105,7 +105,6 @@ def __init__( del kwargs[key] self.kwargs = kwargs - self.threshold_instances = int(1e10) self.batch_size = batch_size self.random_state = random_state self.hash_file = None diff --git a/csrank/objectranking/rank_net.py b/csrank/objectranking/rank_net.py index 39dd1459..a2f8325d 100644 --- a/csrank/objectranking/rank_net.py +++ b/csrank/objectranking/rank_net.py @@ -98,13 +98,6 @@ def _convert_instances_(self, X, Y): self.logger.debug("Creating the Dataset") garbage, x1, x2, garbage, y_single = generate_complete_pairwise_dataset(X, Y) del garbage - if x1.shape[0] > self.threshold_instances: - indices = self.random_state.choice( - x1.shape[0], self.threshold_instances, replace=False - ) - x1 = x1[indices, :] - x2 = x2[indices, :] - y_single = y_single[indices] self.logger.debug("Finished the Dataset instances {}".format(x1.shape[0])) return x1, x2, y_single