From 727d61cf86925908615a5af4520d4b2a960daac1 Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Mon, 27 Jan 2020 00:29:19 +0100 Subject: [PATCH 01/11] Condensed commits for all changes in new branch --- csrank/metrics.py | 39 ++- csrank/objectranking/lambdamart.py | 376 +++++++++++++++++++++++++++++ csrank/tests/test_ranking.py | 2 + 3 files changed, 416 insertions(+), 1 deletion(-) create mode 100644 csrank/objectranking/lambdamart.py diff --git a/csrank/metrics.py b/csrank/metrics.py index eb554449..bc2e338d 100644 --- a/csrank/metrics.py +++ b/csrank/metrics.py @@ -47,6 +47,7 @@ import numpy as np import tensorflow as tf from keras import backend as K +import math from csrank.tensorflow_util import scores_to_rankings, get_instances_objects, tensorify @@ -54,7 +55,8 @@ 'zero_one_rank_loss_for_scores_ties', 'make_ndcg_at_k_loss', 'kendalls_tau_for_scores', 'spearman_correlation_for_scores', "zero_one_accuracy", - "zero_one_accuracy_for_scores", "topk_categorical_accuracy"] + "zero_one_accuracy_for_scores", "topk_categorical_accuracy", + "point_dcg", "dcg", "ndcg"] def zero_one_rank_loss(y_true, y_pred): @@ -331,3 +333,38 @@ def err(y_true, y_pred, utility_function=None, probability_mapping=None): results = tf.reduce_sum(discounted_document_values, axis=1) return K.mean(results) + +def point_dcg(args): + """ + Point DCG calculation function. Calculates the DCG for a given list. This list is assumed to be consisting of the rankings of documents belonging to the same query + """ + pos, label = args + return (2 ** label - 1) / math.log(pos + 2, 2) + +def dcg(preds): + """ + List DCG calculation function. This function turns the list of rankings into a form which is easier to be passed to the point DCG function + """ + return sum(map(point_dcg, enumerate(preds))) + +def ndcg(preds, k=10): + """ + NDCG calculation function that calculates the NDCG values with the help of the DCG calculation helper functions. + """ + ideal_top = preds[:k] + + true_top = np.array([]) + if len(preds) > 10: + true_top = np.partition(preds, -10)[-k:] + true_top.sort() + else: + true_top = np.sort(preds) + true_top = true_top[::-1] + + max_dcg = dcg(true_top) + ideal_dcg = dcg(ideal_top) + + if max_dcg == 0: + return 1 + + return ideal_dcg / max_dcg \ No newline at end of file diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py new file mode 100644 index 00000000..94b7cb4f --- /dev/null +++ b/csrank/objectranking/lambdamart.py @@ -0,0 +1,376 @@ +import logging, math +from collections import deque +from multiprocessing import Pool +from itertools import chain + +import numpy as np +from sklearn.tree import DecisionTreeRegressor + +from csrank.learner import Learner +from csrank.metrics import point_dcg, dcg, ndcg +from csrank.objectranking.object_ranker import ObjectRanker + +class LambdaMART(ObjectRanker,Learner): + def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, learning_rate=1e-3, + min_samples_split=2, max_depth=50, min_samples_leaf=1, max_leaf_nodes=None, num_process = None, + criterion="mse", splitter="best", min_weight_fraction_leaf=None, max_features=None, random_state=None, + min_impurity_decrease=None, min_impurity_split=None, **kwargs): + """ + Create a LambdaMART based rank regression model. This model uses an ensemble of trees that learn to predict + the relevance scores of the documents based on the features, which then can be turned into rankings. + The base learner used is the implementation of Decision Tree from the sklearn tree package. The learner + tries to indirectly optimize the nDCG metric by learning the lambdas. + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_objects : int + Number of objects + number_of_trees : int + The maximum number of trees that are to be trained for the ensemble. + learning_rate : float + learning rate for the LambdaMART algorithm + min_samples_split : int + Number of samples required to split the internal node + max_depth : int + Maximum depth of the tree + min_samples_leaf : int + Minimum number of samples required to be at the leaf node + + References + ---------- + [1] Burges, Chris J.C. (2010, June). "From RankNet to LambdaRank to LambdaMART: An Overview" + """ + self.n_object_features = n_object_features + self.n_objects = n_objects + self.number_of_trees = number_of_trees + self.learning_rate = learning_rate + self.min_samples_split = min_samples_split + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.max_leaf_nodes = max_leaf_nodes + self.num_process = num_process + self.ensemble = [] + self.random_state = random_state + self.criterion = criterion + self.splitter = splitter + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.min_impurity_split = min_impurity_split + self.logger = logging.getLogger(LambdaMART.__name__) + + def _prepare_train_data(self, X, Y, **kwargs): + """ + Transform the data provided in the form of X_train of shape (n_instances,n_objects,n_features) and y_train of shape (n_instances,n_documents) into (n_instances*n_objects,n_features). The output format is similar to the oneprovided by the cusrom dataset reader. + + Parameters + --------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Rankings of the given objects + Returns + ------ + Returns an array of shape (n_instances*n_objects,n_features) with the features and relevance scores derived from the ranking provided in y_train + + """ + #prepare array like features and imaginary qids + xdim = X.shape[0] # n_instances - qid + ydim = X.shape[1] # n_objects - documents + zdim = X.shape[2] # n_features + + features_as_list = deque() + for i in range(0,xdim): + for j in range(0,ydim): + row_as_list=deque([i]) + features = deque() + for k in range(0, zdim): + row_as_list.append(X[i, j, k]) + features_as_list.append(row_as_list) + + #Convert rankings to relevance scores + scores_docsize = Y.shape[1] + relscore_train = np.subtract(scores_docsize, Y) + + #prepare array like relevance score values + xdim_scores = relscore_train.shape[0] + ydim_scores = relscore_train.shape[1] + + scores_as_list = deque() + for x in range(0,xdim_scores): + for y in range(0,ydim_scores): + scores_as_list.append(relscore_train[x,y]) + + #Check if both the dimensions are the same + assert(len(features_as_list)==len(scores_as_list)) + + #convert to numpy and resize the arrays + features = np.asarray(features_as_list) + scores_unflat = np.array(scores_as_list) + scores = np.reshape(scores_unflat,(len(scores_unflat),1)) + + #Concatenate the reshaped arrays and return as trainin data + train_data = np.concatenate((scores,features),axis=1) + + return train_data + + def _group_by_queries(self, data, queries): + + """ + Internal function which orders the data given as input based on the queries supplied. + """ + result = [] + curr_query = None + for s, q in zip(data, queries): + if q != curr_query: + result.append([]) + curr_query = q + result[-1].append(s) + result = list(map(np.array, result)) + return result + + def fit(self, X, y, **kwargs): + """ + Fit a LambdaMART algorithm to the provided X and y arrays where X contains the features and y being the relevance scores. + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Rankings of the given objects + **kwargs + Keyword arguments for the fit function + + Returns + ------- + Returns the model which is in turn just a list of all the trees that make up the MART model + + """ + #check the case if the ensemble already has some trees then clear the trees so that the trees from the previous iteration are not used. + if len(self.ensemble) > 0: + self.ensemble.clear() + + train_file = self._prepare_train_data(X, y) + scores = train_file[:, 0] + queries = train_file[:, 1] + features = train_file[:, 3:] + + model_preds = np.zeros(len(features)) + + for i in range(self.number_of_trees): + #print(" Iteration: " + str(i + 1)) + true_data = self._group_by_queries(scores, queries) + model_data = self._group_by_queries(model_preds, queries) + + with Pool(self.num_process) as pool: + lambdas_draft = pool.map(query_lambdas, list(zip(true_data, model_data))) + lambdas = list(chain(*lambdas_draft)) + + tree = DecisionTreeRegressor(criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=None, + random_state=self.random_state, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + min_impurity_split=self.min_impurity_split) + tree.fit(features, lambdas) + + self.ensemble.append(tree) + + prediction = tree.predict(features) + model_preds += self.learning_rate * prediction + #TODO: Remove the next two statements after debugging + train_score = self._score(model_preds, scores, queries, 10) + print(" --iteration train score " + str(train_score)) + return self.ensemble + + def _predict_scores_fixed(self, X, **kwargs): + """ + Predict the scores for a given collection of sets of objects of same size. + + Parameters + ---------- + X : array-like, shape (n_samples, n_objects, n_features) + + + Returns + ------- + Y : array-like, shape (n_samples, n_objects) + Returns the scores of each of the objects for each of the samples. + """ + n_instances, n_objects, n_features = X.shape + self.logger.info("For Test instances {} objects {} features {}".format(*X.shape)) + X1 = X.reshape(n_instances * n_objects, n_features) + scores = np.zeros(n_instances * n_objects) + for tree in self.ensemble: + scores += tree.predict(X1) + scores = scores.reshape(n_instances, n_objects) + return scores + + def predict_scores(self, X, **kwargs): + """ + Predict the utility scores for each object in the collection of set of objects called a query set. + + Parameters + ---------- + X : numpy array of size (n_instances, n_objects, n_features) + + Returns + ------- + Numpy array of size (n_instances, n_objects) + """ + return super().predict_scores(X, **kwargs) + + def predict_for_scores(self, scores, **kwargs): + """ + Predict rankings for the scores for a given collection of sets of objects (query sets). Wrapper that calls the function of the same name + belonging to the ObjectRanker super class. + """ + return ObjectRanker.predict_for_scores(self, scores, **kwargs) + + def predict(self, X, **kwargs): + return super().predict(X, **kwargs) + + def _predict(self, pred_vector): + """ + Predict the scores for the data supplied by iterating over the ensemble and returning the output. + + Parameters + ---------- + pred_vector: this is a numpy array of shape (n_objects,n_features) + + Returns + ------- + results: Predicted scores for each of the objects + queries: queries corresponding to the predictions that are made + """ + queries = pred_vector[:, 1] + features = pred_vector[:, 2:] + + results = np.zeros(len(features)) + for tree in self.ensemble: + results += tree.predict(features) * self.learning_rate + return results, queries + + def _score(self, prediction, true_score, query, k=10): + """ + Function that is used to score the performance of the model. + + Parameters + ---------- + prediction: Predictions of the model + true_score: ground truth data of the predictions + query: queries accompanying the prediction data used to calculate the ndcg value + + Returns + ------- + Returns the average NDCG value calculated on the basis of the queries supplied, for the predictions + """ + true_data = self._group_by_queries(true_score, query) + model_data = self._group_by_queries(prediction, query) + + total_ndcg = [] + + for true_d, model_d in zip(true_data, model_data): + data = true_d[np.argsort(model_d)[::-1]] + total_ndcg.append(ndcg(data, k)) + + return sum(total_ndcg) / len(total_ndcg) + + def set_tunable_parameters(self, min_samples_split, max_depth, min_samples_leaf, max_leaf_nodes, + learning_rate, number_of_trees, criterion, splitter, min_weight_fraction_leaf, + max_features, random_state, min_impurity_decrease, min_impurity_split, **kwargs): + """ + Set the tunable hyperparameters of the DecisionTree model used in LambdaMART + + Parameters + ---------- + min_samples_split : int + Number of samples required to split the internal node + max_depth : int + Maximum depth of the tree + min_samples_leaf : int + Minimum number of samples required to be at the leaf node + max_leaf_nodes : int + These are the maximum number of leaf nodes used to grow the tree + number_of_trees : int + The maximum number of trees that are to be trained for the ensemble. + learning_rate : float + learning rate for the LambdaMART algorithm + """ + self.min_samples_split = min_samples_split + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.max_leaf_nodes = max_leaf_nodes + self.number_of_trees = number_of_trees + self.learning_rate = learning_rate + self.criterion = criterion + self.splitter = splitter + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.min_impurity_split = min_impurity_split + + +def query_lambdas(data, k=10): + """ + This is used by the LambdaMART learner to compute the lambda values that are to be used as the target variable for the learner. + + Parameters + ---------- + data : This contains the training data and the predictions from the previous iteration of the learning loop to calculate the lambda values + + Returns + ------- + Returns the lambda values calculated for the current iteration + """ + true_data, model_data = data + worst_order = np.argsort(true_data) + + true_data = true_data[worst_order] + model_data = model_data[worst_order] + + + model_order = np.argsort(model_data) + + idcg = dcg(np.sort(true_data)[-10:][::-1]) + + size = len(true_data) + position_score = np.zeros((size, size)) + + for i in range(size): + for j in range(size): + position_score[model_order[i], model_order[j]] = \ + point_dcg((model_order[j], true_data[model_order[i]])) + + lambdas = np.zeros(size) + + for i in range(size): + for j in range(size): + if true_data[i] > true_data[j]: + + delta_dcg = position_score[i][j] - position_score[i][i] + delta_dcg += position_score[j][i] - position_score[j][j] + + delta_ndcg = abs(delta_dcg / idcg) + + rho = 1 / (1 + math.exp(model_data[i] - model_data[j])) + + lam = rho * delta_ndcg + + lambdas[j] -= lam + lambdas[i] += lam + return lambdas \ No newline at end of file diff --git a/csrank/tests/test_ranking.py b/csrank/tests/test_ranking.py index 22fe72df..e1d8f7fd 100644 --- a/csrank/tests/test_ranking.py +++ b/csrank/tests/test_ranking.py @@ -16,6 +16,8 @@ object_rankers = { FATELINEAR_RANKER: (FATELinearObjectRanker, {"n_hidden_set_units": 12, "batch_size": 1}, (0.0, 1.0)), FETALINEAR_RANKER: (FETALinearObjectRanker, {}, (0.0, 1.0)), + LAMBDAMART: (LambdaMART, {"min_samples_split": 2, "max_depth": 50, "min_samples_leaf": 1, + "max_leaf_nodes": 10}, (0.66, 0.0)), FETA_RANKER: (FETAObjectRanker, {"add_zeroth_order_model": True, "optimizer": optimizer}, (0.0, 1.0)), RANKNET: (RankNet, {"optimizer": optimizer}, (0.0, 1.0)), CMPNET: (CmpNet, {"optimizer": optimizer}, (0.0, 1.0)), From 94da9f76975051000748e1fb4c8185bce81855f3 Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Mon, 27 Jan 2020 00:29:19 +0100 Subject: [PATCH 02/11] Condensed commits for all changes in new branch --- csrank/metrics.py | 39 ++- csrank/objectranking/lambdamart.py | 376 +++++++++++++++++++++++++++++ csrank/tests/test_ranking.py | 2 + 3 files changed, 416 insertions(+), 1 deletion(-) create mode 100644 csrank/objectranking/lambdamart.py diff --git a/csrank/metrics.py b/csrank/metrics.py index eb554449..bc2e338d 100644 --- a/csrank/metrics.py +++ b/csrank/metrics.py @@ -47,6 +47,7 @@ import numpy as np import tensorflow as tf from keras import backend as K +import math from csrank.tensorflow_util import scores_to_rankings, get_instances_objects, tensorify @@ -54,7 +55,8 @@ 'zero_one_rank_loss_for_scores_ties', 'make_ndcg_at_k_loss', 'kendalls_tau_for_scores', 'spearman_correlation_for_scores', "zero_one_accuracy", - "zero_one_accuracy_for_scores", "topk_categorical_accuracy"] + "zero_one_accuracy_for_scores", "topk_categorical_accuracy", + "point_dcg", "dcg", "ndcg"] def zero_one_rank_loss(y_true, y_pred): @@ -331,3 +333,38 @@ def err(y_true, y_pred, utility_function=None, probability_mapping=None): results = tf.reduce_sum(discounted_document_values, axis=1) return K.mean(results) + +def point_dcg(args): + """ + Point DCG calculation function. Calculates the DCG for a given list. This list is assumed to be consisting of the rankings of documents belonging to the same query + """ + pos, label = args + return (2 ** label - 1) / math.log(pos + 2, 2) + +def dcg(preds): + """ + List DCG calculation function. This function turns the list of rankings into a form which is easier to be passed to the point DCG function + """ + return sum(map(point_dcg, enumerate(preds))) + +def ndcg(preds, k=10): + """ + NDCG calculation function that calculates the NDCG values with the help of the DCG calculation helper functions. + """ + ideal_top = preds[:k] + + true_top = np.array([]) + if len(preds) > 10: + true_top = np.partition(preds, -10)[-k:] + true_top.sort() + else: + true_top = np.sort(preds) + true_top = true_top[::-1] + + max_dcg = dcg(true_top) + ideal_dcg = dcg(ideal_top) + + if max_dcg == 0: + return 1 + + return ideal_dcg / max_dcg \ No newline at end of file diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py new file mode 100644 index 00000000..94b7cb4f --- /dev/null +++ b/csrank/objectranking/lambdamart.py @@ -0,0 +1,376 @@ +import logging, math +from collections import deque +from multiprocessing import Pool +from itertools import chain + +import numpy as np +from sklearn.tree import DecisionTreeRegressor + +from csrank.learner import Learner +from csrank.metrics import point_dcg, dcg, ndcg +from csrank.objectranking.object_ranker import ObjectRanker + +class LambdaMART(ObjectRanker,Learner): + def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, learning_rate=1e-3, + min_samples_split=2, max_depth=50, min_samples_leaf=1, max_leaf_nodes=None, num_process = None, + criterion="mse", splitter="best", min_weight_fraction_leaf=None, max_features=None, random_state=None, + min_impurity_decrease=None, min_impurity_split=None, **kwargs): + """ + Create a LambdaMART based rank regression model. This model uses an ensemble of trees that learn to predict + the relevance scores of the documents based on the features, which then can be turned into rankings. + The base learner used is the implementation of Decision Tree from the sklearn tree package. The learner + tries to indirectly optimize the nDCG metric by learning the lambdas. + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_objects : int + Number of objects + number_of_trees : int + The maximum number of trees that are to be trained for the ensemble. + learning_rate : float + learning rate for the LambdaMART algorithm + min_samples_split : int + Number of samples required to split the internal node + max_depth : int + Maximum depth of the tree + min_samples_leaf : int + Minimum number of samples required to be at the leaf node + + References + ---------- + [1] Burges, Chris J.C. (2010, June). "From RankNet to LambdaRank to LambdaMART: An Overview" + """ + self.n_object_features = n_object_features + self.n_objects = n_objects + self.number_of_trees = number_of_trees + self.learning_rate = learning_rate + self.min_samples_split = min_samples_split + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.max_leaf_nodes = max_leaf_nodes + self.num_process = num_process + self.ensemble = [] + self.random_state = random_state + self.criterion = criterion + self.splitter = splitter + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.min_impurity_split = min_impurity_split + self.logger = logging.getLogger(LambdaMART.__name__) + + def _prepare_train_data(self, X, Y, **kwargs): + """ + Transform the data provided in the form of X_train of shape (n_instances,n_objects,n_features) and y_train of shape (n_instances,n_documents) into (n_instances*n_objects,n_features). The output format is similar to the oneprovided by the cusrom dataset reader. + + Parameters + --------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Rankings of the given objects + Returns + ------ + Returns an array of shape (n_instances*n_objects,n_features) with the features and relevance scores derived from the ranking provided in y_train + + """ + #prepare array like features and imaginary qids + xdim = X.shape[0] # n_instances - qid + ydim = X.shape[1] # n_objects - documents + zdim = X.shape[2] # n_features + + features_as_list = deque() + for i in range(0,xdim): + for j in range(0,ydim): + row_as_list=deque([i]) + features = deque() + for k in range(0, zdim): + row_as_list.append(X[i, j, k]) + features_as_list.append(row_as_list) + + #Convert rankings to relevance scores + scores_docsize = Y.shape[1] + relscore_train = np.subtract(scores_docsize, Y) + + #prepare array like relevance score values + xdim_scores = relscore_train.shape[0] + ydim_scores = relscore_train.shape[1] + + scores_as_list = deque() + for x in range(0,xdim_scores): + for y in range(0,ydim_scores): + scores_as_list.append(relscore_train[x,y]) + + #Check if both the dimensions are the same + assert(len(features_as_list)==len(scores_as_list)) + + #convert to numpy and resize the arrays + features = np.asarray(features_as_list) + scores_unflat = np.array(scores_as_list) + scores = np.reshape(scores_unflat,(len(scores_unflat),1)) + + #Concatenate the reshaped arrays and return as trainin data + train_data = np.concatenate((scores,features),axis=1) + + return train_data + + def _group_by_queries(self, data, queries): + + """ + Internal function which orders the data given as input based on the queries supplied. + """ + result = [] + curr_query = None + for s, q in zip(data, queries): + if q != curr_query: + result.append([]) + curr_query = q + result[-1].append(s) + result = list(map(np.array, result)) + return result + + def fit(self, X, y, **kwargs): + """ + Fit a LambdaMART algorithm to the provided X and y arrays where X contains the features and y being the relevance scores. + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Rankings of the given objects + **kwargs + Keyword arguments for the fit function + + Returns + ------- + Returns the model which is in turn just a list of all the trees that make up the MART model + + """ + #check the case if the ensemble already has some trees then clear the trees so that the trees from the previous iteration are not used. + if len(self.ensemble) > 0: + self.ensemble.clear() + + train_file = self._prepare_train_data(X, y) + scores = train_file[:, 0] + queries = train_file[:, 1] + features = train_file[:, 3:] + + model_preds = np.zeros(len(features)) + + for i in range(self.number_of_trees): + #print(" Iteration: " + str(i + 1)) + true_data = self._group_by_queries(scores, queries) + model_data = self._group_by_queries(model_preds, queries) + + with Pool(self.num_process) as pool: + lambdas_draft = pool.map(query_lambdas, list(zip(true_data, model_data))) + lambdas = list(chain(*lambdas_draft)) + + tree = DecisionTreeRegressor(criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=None, + random_state=self.random_state, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + min_impurity_split=self.min_impurity_split) + tree.fit(features, lambdas) + + self.ensemble.append(tree) + + prediction = tree.predict(features) + model_preds += self.learning_rate * prediction + #TODO: Remove the next two statements after debugging + train_score = self._score(model_preds, scores, queries, 10) + print(" --iteration train score " + str(train_score)) + return self.ensemble + + def _predict_scores_fixed(self, X, **kwargs): + """ + Predict the scores for a given collection of sets of objects of same size. + + Parameters + ---------- + X : array-like, shape (n_samples, n_objects, n_features) + + + Returns + ------- + Y : array-like, shape (n_samples, n_objects) + Returns the scores of each of the objects for each of the samples. + """ + n_instances, n_objects, n_features = X.shape + self.logger.info("For Test instances {} objects {} features {}".format(*X.shape)) + X1 = X.reshape(n_instances * n_objects, n_features) + scores = np.zeros(n_instances * n_objects) + for tree in self.ensemble: + scores += tree.predict(X1) + scores = scores.reshape(n_instances, n_objects) + return scores + + def predict_scores(self, X, **kwargs): + """ + Predict the utility scores for each object in the collection of set of objects called a query set. + + Parameters + ---------- + X : numpy array of size (n_instances, n_objects, n_features) + + Returns + ------- + Numpy array of size (n_instances, n_objects) + """ + return super().predict_scores(X, **kwargs) + + def predict_for_scores(self, scores, **kwargs): + """ + Predict rankings for the scores for a given collection of sets of objects (query sets). Wrapper that calls the function of the same name + belonging to the ObjectRanker super class. + """ + return ObjectRanker.predict_for_scores(self, scores, **kwargs) + + def predict(self, X, **kwargs): + return super().predict(X, **kwargs) + + def _predict(self, pred_vector): + """ + Predict the scores for the data supplied by iterating over the ensemble and returning the output. + + Parameters + ---------- + pred_vector: this is a numpy array of shape (n_objects,n_features) + + Returns + ------- + results: Predicted scores for each of the objects + queries: queries corresponding to the predictions that are made + """ + queries = pred_vector[:, 1] + features = pred_vector[:, 2:] + + results = np.zeros(len(features)) + for tree in self.ensemble: + results += tree.predict(features) * self.learning_rate + return results, queries + + def _score(self, prediction, true_score, query, k=10): + """ + Function that is used to score the performance of the model. + + Parameters + ---------- + prediction: Predictions of the model + true_score: ground truth data of the predictions + query: queries accompanying the prediction data used to calculate the ndcg value + + Returns + ------- + Returns the average NDCG value calculated on the basis of the queries supplied, for the predictions + """ + true_data = self._group_by_queries(true_score, query) + model_data = self._group_by_queries(prediction, query) + + total_ndcg = [] + + for true_d, model_d in zip(true_data, model_data): + data = true_d[np.argsort(model_d)[::-1]] + total_ndcg.append(ndcg(data, k)) + + return sum(total_ndcg) / len(total_ndcg) + + def set_tunable_parameters(self, min_samples_split, max_depth, min_samples_leaf, max_leaf_nodes, + learning_rate, number_of_trees, criterion, splitter, min_weight_fraction_leaf, + max_features, random_state, min_impurity_decrease, min_impurity_split, **kwargs): + """ + Set the tunable hyperparameters of the DecisionTree model used in LambdaMART + + Parameters + ---------- + min_samples_split : int + Number of samples required to split the internal node + max_depth : int + Maximum depth of the tree + min_samples_leaf : int + Minimum number of samples required to be at the leaf node + max_leaf_nodes : int + These are the maximum number of leaf nodes used to grow the tree + number_of_trees : int + The maximum number of trees that are to be trained for the ensemble. + learning_rate : float + learning rate for the LambdaMART algorithm + """ + self.min_samples_split = min_samples_split + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.max_leaf_nodes = max_leaf_nodes + self.number_of_trees = number_of_trees + self.learning_rate = learning_rate + self.criterion = criterion + self.splitter = splitter + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.min_impurity_split = min_impurity_split + + +def query_lambdas(data, k=10): + """ + This is used by the LambdaMART learner to compute the lambda values that are to be used as the target variable for the learner. + + Parameters + ---------- + data : This contains the training data and the predictions from the previous iteration of the learning loop to calculate the lambda values + + Returns + ------- + Returns the lambda values calculated for the current iteration + """ + true_data, model_data = data + worst_order = np.argsort(true_data) + + true_data = true_data[worst_order] + model_data = model_data[worst_order] + + + model_order = np.argsort(model_data) + + idcg = dcg(np.sort(true_data)[-10:][::-1]) + + size = len(true_data) + position_score = np.zeros((size, size)) + + for i in range(size): + for j in range(size): + position_score[model_order[i], model_order[j]] = \ + point_dcg((model_order[j], true_data[model_order[i]])) + + lambdas = np.zeros(size) + + for i in range(size): + for j in range(size): + if true_data[i] > true_data[j]: + + delta_dcg = position_score[i][j] - position_score[i][i] + delta_dcg += position_score[j][i] - position_score[j][j] + + delta_ndcg = abs(delta_dcg / idcg) + + rho = 1 / (1 + math.exp(model_data[i] - model_data[j])) + + lam = rho * delta_ndcg + + lambdas[j] -= lam + lambdas[i] += lam + return lambdas \ No newline at end of file diff --git a/csrank/tests/test_ranking.py b/csrank/tests/test_ranking.py index 22fe72df..e1d8f7fd 100644 --- a/csrank/tests/test_ranking.py +++ b/csrank/tests/test_ranking.py @@ -16,6 +16,8 @@ object_rankers = { FATELINEAR_RANKER: (FATELinearObjectRanker, {"n_hidden_set_units": 12, "batch_size": 1}, (0.0, 1.0)), FETALINEAR_RANKER: (FETALinearObjectRanker, {}, (0.0, 1.0)), + LAMBDAMART: (LambdaMART, {"min_samples_split": 2, "max_depth": 50, "min_samples_leaf": 1, + "max_leaf_nodes": 10}, (0.66, 0.0)), FETA_RANKER: (FETAObjectRanker, {"add_zeroth_order_model": True, "optimizer": optimizer}, (0.0, 1.0)), RANKNET: (RankNet, {"optimizer": optimizer}, (0.0, 1.0)), CMPNET: (CmpNet, {"optimizer": optimizer}, (0.0, 1.0)), From b7d7c01e8b50f9f13a48327d1f1f857f83594a29 Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Fri, 14 Feb 2020 14:08:49 +0100 Subject: [PATCH 03/11] Fixed some linebreak suggestions and move the class specific metric functions to the lambdamart file itself --- csrank/metrics.py | 40 +------------ csrank/objectranking/lambdamart.py | 90 +++++++++++++++++++++--------- 2 files changed, 67 insertions(+), 63 deletions(-) diff --git a/csrank/metrics.py b/csrank/metrics.py index bc2e338d..9aadb9c8 100644 --- a/csrank/metrics.py +++ b/csrank/metrics.py @@ -55,8 +55,7 @@ 'zero_one_rank_loss_for_scores_ties', 'make_ndcg_at_k_loss', 'kendalls_tau_for_scores', 'spearman_correlation_for_scores', "zero_one_accuracy", - "zero_one_accuracy_for_scores", "topk_categorical_accuracy", - "point_dcg", "dcg", "ndcg"] + "zero_one_accuracy_for_scores", "topk_categorical_accuracy"] def zero_one_rank_loss(y_true, y_pred): @@ -332,39 +331,4 @@ def err(y_true, y_pred, utility_function=None, probability_mapping=None): discounted_document_values = tf.cast(satisfied_at_rank, tf.float64) * discount_at_rank results = tf.reduce_sum(discounted_document_values, axis=1) - return K.mean(results) - -def point_dcg(args): - """ - Point DCG calculation function. Calculates the DCG for a given list. This list is assumed to be consisting of the rankings of documents belonging to the same query - """ - pos, label = args - return (2 ** label - 1) / math.log(pos + 2, 2) - -def dcg(preds): - """ - List DCG calculation function. This function turns the list of rankings into a form which is easier to be passed to the point DCG function - """ - return sum(map(point_dcg, enumerate(preds))) - -def ndcg(preds, k=10): - """ - NDCG calculation function that calculates the NDCG values with the help of the DCG calculation helper functions. - """ - ideal_top = preds[:k] - - true_top = np.array([]) - if len(preds) > 10: - true_top = np.partition(preds, -10)[-k:] - true_top.sort() - else: - true_top = np.sort(preds) - true_top = true_top[::-1] - - max_dcg = dcg(true_top) - ideal_dcg = dcg(ideal_top) - - if max_dcg == 0: - return 1 - - return ideal_dcg / max_dcg \ No newline at end of file + return K.mean(results) \ No newline at end of file diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py index 94b7cb4f..d352d234 100644 --- a/csrank/objectranking/lambdamart.py +++ b/csrank/objectranking/lambdamart.py @@ -7,7 +7,6 @@ from sklearn.tree import DecisionTreeRegressor from csrank.learner import Learner -from csrank.metrics import point_dcg, dcg, ndcg from csrank.objectranking.object_ranker import ObjectRanker class LambdaMART(ObjectRanker,Learner): @@ -64,7 +63,9 @@ def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, le def _prepare_train_data(self, X, Y, **kwargs): """ - Transform the data provided in the form of X_train of shape (n_instances,n_objects,n_features) and y_train of shape (n_instances,n_documents) into (n_instances*n_objects,n_features). The output format is similar to the oneprovided by the cusrom dataset reader. + Transform the data provided in the form of X_train of shape (n_instances,n_objects,n_features) + and y_train of shape (n_instances,n_documents) into (n_instances*n_objects,n_features). + The output format is similar to the oneprovided by the cusrom dataset reader. Parameters --------- @@ -76,7 +77,8 @@ def _prepare_train_data(self, X, Y, **kwargs): Rankings of the given objects Returns ------ - Returns an array of shape (n_instances*n_objects,n_features) with the features and relevance scores derived from the ranking provided in y_train + Returns an array of shape (n_instances*n_objects,n_features) with the features and relevance + scores derived from the ranking provided in y_train """ #prepare array like features and imaginary qids @@ -120,7 +122,6 @@ def _prepare_train_data(self, X, Y, **kwargs): return train_data def _group_by_queries(self, data, queries): - """ Internal function which orders the data given as input based on the queries supplied. """ @@ -136,25 +137,27 @@ def _group_by_queries(self, data, queries): def fit(self, X, y, **kwargs): """ - Fit a LambdaMART algorithm to the provided X and y arrays where X contains the features and y being the relevance scores. - - Parameters - ---------- - X : numpy array - (n_instances, n_objects, n_features) - Feature vectors of the objects - Y : numpy array - (n_instances, n_objects) - Rankings of the given objects - **kwargs - Keyword arguments for the fit function + Fit a LambdaMART algorithm to the provided X and y arrays where X contains the features and y + being the relevance scores. + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Rankings of the given objects + **kwargs + Keyword arguments for the fit function - Returns - ------- - Returns the model which is in turn just a list of all the trees that make up the MART model + Returns + ------- + Returns the model which is in turn just a list of all the trees that make up the MART model """ - #check the case if the ensemble already has some trees then clear the trees so that the trees from the previous iteration are not used. + #check the case if the ensemble already has some trees then clear the trees so that the trees + #from the previous iteration are not used. if len(self.ensemble) > 0: self.ensemble.clear() @@ -235,8 +238,8 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): """ - Predict rankings for the scores for a given collection of sets of objects (query sets). Wrapper that calls the function of the same name - belonging to the ObjectRanker super class. + Predict rankings for the scores for a given collection of sets of objects (query sets). + Wrapper that calls the function of the same name belonging to the ObjectRanker super class. """ return ObjectRanker.predict_for_scores(self, scores, **kwargs) @@ -327,11 +330,13 @@ def set_tunable_parameters(self, min_samples_split, max_depth, min_samples_leaf, def query_lambdas(data, k=10): """ - This is used by the LambdaMART learner to compute the lambda values that are to be used as the target variable for the learner. + This is used by the LambdaMART learner to compute the lambda values that are to be used as the + target variable for the learner. Parameters ---------- - data : This contains the training data and the predictions from the previous iteration of the learning loop to calculate the lambda values + data : This contains the training data and the predictions from the previous iteration of + the learning loop to calculate the lambda values Returns ------- @@ -373,4 +378,39 @@ def query_lambdas(data, k=10): lambdas[j] -= lam lambdas[i] += lam - return lambdas \ No newline at end of file + return lambdas + +def point_dcg(args): + """ + Point DCG calculation function. Calculates the DCG for a given list. This list is assumed to be consisting of the rankings of documents belonging to the same query + """ + pos, label = args + return (2 ** label - 1) / np.log2(pos + 2) + +def dcg(preds): + """ + List DCG calculation function. This function turns the list of rankings into a form which is easier to be passed to the point DCG function + """ + return sum(map(point_dcg, enumerate(preds))) + +def ndcg(preds, k=10): + """ + NDCG calculation function that calculates the NDCG values with the help of the DCG calculation helper functions. + """ + ideal_top = preds[:k] + + true_top = np.array([]) + if len(preds) > 10: + true_top = np.partition(preds, -10)[-k:] + true_top.sort() + else: + true_top = np.sort(preds) + true_top = true_top[::-1] + + max_dcg = dcg(true_top) + ideal_dcg = dcg(ideal_top) + + if max_dcg == 0: + return 1 + + return ideal_dcg / max_dcg \ No newline at end of file From 80352c8e29fc9e1feb7be83eceefeee9883a9770 Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Fri, 14 Feb 2020 15:32:34 +0100 Subject: [PATCH 04/11] More changes related to formatting and added LambdaMart class to the init file --- csrank/objectranking/__init__.py | 1 + csrank/objectranking/lambdamart.py | 75 ++++++++++++++++++------------ 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/csrank/objectranking/__init__.py b/csrank/objectranking/__init__.py index 8eeaab74..ddb18f99 100644 --- a/csrank/objectranking/__init__.py +++ b/csrank/objectranking/__init__.py @@ -8,3 +8,4 @@ from .rank_net import RankNet from .rank_svm import RankSVM from .baseline import RandomBaselineRanker +from .lambdamart import LambdaMART diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py index d352d234..3077da45 100644 --- a/csrank/objectranking/lambdamart.py +++ b/csrank/objectranking/lambdamart.py @@ -15,10 +15,12 @@ def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, le criterion="mse", splitter="best", min_weight_fraction_leaf=None, max_features=None, random_state=None, min_impurity_decrease=None, min_impurity_split=None, **kwargs): """ - Create a LambdaMART based rank regression model. This model uses an ensemble of trees that learn to predict - the relevance scores of the documents based on the features, which then can be turned into rankings. - The base learner used is the implementation of Decision Tree from the sklearn tree package. The learner - tries to indirectly optimize the nDCG metric by learning the lambdas. + Create a LambdaMART based rank regression model. This model uses an + ensemble of trees that learn to predict the relevance scores of + the documents based on the features, which then can be turned into + rankings. The base learner used is the implementation of Decision + Tree from the sklearn tree package. The learner tries to indirectly + optimize the nDCG metric by learning the lambdas. Parameters ---------- @@ -63,8 +65,9 @@ def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, le def _prepare_train_data(self, X, Y, **kwargs): """ - Transform the data provided in the form of X_train of shape (n_instances,n_objects,n_features) - and y_train of shape (n_instances,n_documents) into (n_instances*n_objects,n_features). + Transform the data provided in the form of X_train of shape + (n_instances,n_objects,n_features) and y_train of shape + (n_instances,n_documents) into (n_instances*n_objects,n_features). The output format is similar to the oneprovided by the cusrom dataset reader. Parameters @@ -77,8 +80,8 @@ def _prepare_train_data(self, X, Y, **kwargs): Rankings of the given objects Returns ------ - Returns an array of shape (n_instances*n_objects,n_features) with the features and relevance - scores derived from the ranking provided in y_train + Returns an array of shape (n_instances*n_objects,n_features) with the + features and relevance scores derived from the ranking provided in y_train """ #prepare array like features and imaginary qids @@ -123,7 +126,8 @@ def _prepare_train_data(self, X, Y, **kwargs): def _group_by_queries(self, data, queries): """ - Internal function which orders the data given as input based on the queries supplied. + Internal function which orders the data given as input based + on the queries supplied. """ result = [] curr_query = None @@ -137,8 +141,8 @@ def _group_by_queries(self, data, queries): def fit(self, X, y, **kwargs): """ - Fit a LambdaMART algorithm to the provided X and y arrays where X contains the features and y - being the relevance scores. + Fit a LambdaMART algorithm to the provided X and y arrays where X + contains the features and y being the relevance scores. Parameters ---------- @@ -153,11 +157,12 @@ def fit(self, X, y, **kwargs): Returns ------- - Returns the model which is in turn just a list of all the trees that make up the MART model + Returns the model which is in turn just a list of all the trees that + make up the MART model """ - #check the case if the ensemble already has some trees then clear the trees so that the trees - #from the previous iteration are not used. + #check the case if the ensemble already has some trees then clear the trees + # so that the trees from the previous iteration are not used. if len(self.ensemble) > 0: self.ensemble.clear() @@ -196,8 +201,8 @@ def fit(self, X, y, **kwargs): model_preds += self.learning_rate * prediction #TODO: Remove the next two statements after debugging train_score = self._score(model_preds, scores, queries, 10) - print(" --iteration train score " + str(train_score)) - return self.ensemble + #print(" --iteration train score " + str(train_score)) + #return self.ensemble def _predict_scores_fixed(self, X, **kwargs): """ @@ -224,7 +229,8 @@ def _predict_scores_fixed(self, X, **kwargs): def predict_scores(self, X, **kwargs): """ - Predict the utility scores for each object in the collection of set of objects called a query set. + Predict the utility scores for each object in the collection of set + of objects called a query set. Parameters ---------- @@ -238,8 +244,9 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): """ - Predict rankings for the scores for a given collection of sets of objects (query sets). - Wrapper that calls the function of the same name belonging to the ObjectRanker super class. + Predict rankings for the scores for a given collection of sets of objects + (query sets). Wrapper that calls the function of the same name belonging + to the ObjectRanker super class. """ return ObjectRanker.predict_for_scores(self, scores, **kwargs) @@ -248,7 +255,8 @@ def predict(self, X, **kwargs): def _predict(self, pred_vector): """ - Predict the scores for the data supplied by iterating over the ensemble and returning the output. + Predict the scores for the data supplied by iterating over the + ensemble and returning the output. Parameters ---------- @@ -275,11 +283,13 @@ def _score(self, prediction, true_score, query, k=10): ---------- prediction: Predictions of the model true_score: ground truth data of the predictions - query: queries accompanying the prediction data used to calculate the ndcg value + query: queries accompanying the prediction data used to calculate + the ndcg value Returns ------- - Returns the average NDCG value calculated on the basis of the queries supplied, for the predictions + Returns the average NDCG value calculated on the basis of the + queries supplied, for the predictions """ true_data = self._group_by_queries(true_score, query) model_data = self._group_by_queries(prediction, query) @@ -296,7 +306,8 @@ def set_tunable_parameters(self, min_samples_split, max_depth, min_samples_leaf, learning_rate, number_of_trees, criterion, splitter, min_weight_fraction_leaf, max_features, random_state, min_impurity_decrease, min_impurity_split, **kwargs): """ - Set the tunable hyperparameters of the DecisionTree model used in LambdaMART + Set the tunable hyperparameters of the DecisionTree model + used in LambdaMART Parameters ---------- @@ -330,13 +341,13 @@ def set_tunable_parameters(self, min_samples_split, max_depth, min_samples_leaf, def query_lambdas(data, k=10): """ - This is used by the LambdaMART learner to compute the lambda values that are to be used as the - target variable for the learner. + This is used by the LambdaMART learner to compute the lambda values that + are to be used as the target variable for the learner. Parameters ---------- - data : This contains the training data and the predictions from the previous iteration of - the learning loop to calculate the lambda values + data : This contains the training data and the predictions from the + previous iteration of the learning loop to calculate the lambda values Returns ------- @@ -382,20 +393,24 @@ def query_lambdas(data, k=10): def point_dcg(args): """ - Point DCG calculation function. Calculates the DCG for a given list. This list is assumed to be consisting of the rankings of documents belonging to the same query + Point DCG calculation function. Calculates the DCG for a given list. This + list is assumed to be consisting of the rankings of documents belonging to + the same query """ pos, label = args return (2 ** label - 1) / np.log2(pos + 2) def dcg(preds): """ - List DCG calculation function. This function turns the list of rankings into a form which is easier to be passed to the point DCG function + List DCG calculation function. This function turns the list of rankings + into a form which is easier to be passed to the point DCG function """ return sum(map(point_dcg, enumerate(preds))) def ndcg(preds, k=10): """ - NDCG calculation function that calculates the NDCG values with the help of the DCG calculation helper functions. + NDCG calculation function that calculates the NDCG values with the help + of the DCG calculation helper functions. """ ideal_top = preds[:k] From 0bd330be9682fe317039a5fcbed074caeb0c9fbb Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Sun, 16 Feb 2020 17:55:50 +0100 Subject: [PATCH 05/11] Changed default values for some DecisionTree params --- csrank/objectranking/lambdamart.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py index 3077da45..2830880a 100644 --- a/csrank/objectranking/lambdamart.py +++ b/csrank/objectranking/lambdamart.py @@ -12,8 +12,8 @@ class LambdaMART(ObjectRanker,Learner): def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, learning_rate=1e-3, min_samples_split=2, max_depth=50, min_samples_leaf=1, max_leaf_nodes=None, num_process = None, - criterion="mse", splitter="best", min_weight_fraction_leaf=None, max_features=None, random_state=None, - min_impurity_decrease=None, min_impurity_split=None, **kwargs): + criterion="mse", splitter="best", min_weight_fraction_leaf=0.0, max_features=None, random_state=None, + min_impurity_decrease=0.0, min_impurity_split=1e-7, **kwargs): """ Create a LambdaMART based rank regression model. This model uses an ensemble of trees that learn to predict the relevance scores of @@ -62,6 +62,8 @@ def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, le self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.logger = logging.getLogger(LambdaMART.__name__) + #TODO: Used for Debugging, remove for production + #print("LambdaMART init 2") def _prepare_train_data(self, X, Y, **kwargs): """ @@ -200,7 +202,7 @@ def fit(self, X, y, **kwargs): prediction = tree.predict(features) model_preds += self.learning_rate * prediction #TODO: Remove the next two statements after debugging - train_score = self._score(model_preds, scores, queries, 10) + #train_score = self._score(model_preds, scores, queries, 10) #print(" --iteration train score " + str(train_score)) #return self.ensemble From a078b0b1fd12e9748fc338c55c12fd6292ce0068 Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Sun, 16 Feb 2020 18:06:41 +0100 Subject: [PATCH 06/11] Added ranker to the constants for testing --- csrank/experiments/constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrank/experiments/constants.py b/csrank/experiments/constants.py index 97c5314d..78dfcf37 100644 --- a/csrank/experiments/constants.py +++ b/csrank/experiments/constants.py @@ -27,6 +27,7 @@ FATELINEAR_RANKER = "fatelinear_ranker" FETALINEAR_RANKER = "fetalinear_ranker" RANDOM_RANKER = "random_ranker" +LAMBDAMART = "lambdamart" FETA_CHOICE = 'feta_choice' FETALINEAR_CHOICE = "fetalinear_choice" @@ -59,4 +60,4 @@ DCMS = [FETA_DC, FATE_DC, RANKNET_DC, MNL, NLM, GEV, PCL, MLM, RANKSVM_DC, FATELINEAR_DC, FETALINEAR_DC, RANDOM_DC] CHOICE_FUNCTIONS = [FETA_CHOICE, FATE_CHOICE, RANKNET_CHOICE, RANKSVM_CHOICE, GLM_CHOICE, RANDOM_CHOICE, FATELINEAR_CHOICE, FETALINEAR_CHOICE] -OBJECT_RANKERS = [FATE_RANKER, FETA_RANKER, FATELINEAR_RANKER, FETALINEAR_RANKER, RANKSVM, ERR, RANKNET, LISTNET, RANDOM_RANKER] +OBJECT_RANKERS = [FATE_RANKER, FETA_RANKER, FATELINEAR_RANKER, FETALINEAR_RANKER, RANKSVM, ERR, RANKNET, LISTNET, RANDOM_RANKER, LAMBDAMART] From cc5913c0a0677f41da68b75fa9463890ec1d0212 Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Sun, 16 Feb 2020 18:51:42 +0100 Subject: [PATCH 07/11] Set some default parameters for tunable parameters --- csrank/objectranking/lambdamart.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py index 2830880a..b142163d 100644 --- a/csrank/objectranking/lambdamart.py +++ b/csrank/objectranking/lambdamart.py @@ -304,9 +304,9 @@ def _score(self, prediction, true_score, query, k=10): return sum(total_ndcg) / len(total_ndcg) - def set_tunable_parameters(self, min_samples_split, max_depth, min_samples_leaf, max_leaf_nodes, - learning_rate, number_of_trees, criterion, splitter, min_weight_fraction_leaf, - max_features, random_state, min_impurity_decrease, min_impurity_split, **kwargs): + def set_tunable_parameters(self, min_samples_split=2, max_depth=50, min_samples_leaf=1, max_leaf_nodes=None, + learning_rate=1e-3, number_of_trees=5, criterion="mse", splitter="best", min_weight_fraction_leaf=0.0, + max_features=None, random_state=None, min_impurity_decrease=0.0, min_impurity_split=1e-7, **kwargs): """ Set the tunable hyperparameters of the DecisionTree model used in LambdaMART From eb242cbd189c3fa77218e54e1f72135381be4cfc Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Sun, 16 Feb 2020 23:58:46 +0100 Subject: [PATCH 08/11] Made bugfixes to the fit function --- csrank/objectranking/lambdamart.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py index b142163d..bdb9214a 100644 --- a/csrank/objectranking/lambdamart.py +++ b/csrank/objectranking/lambdamart.py @@ -171,7 +171,7 @@ def fit(self, X, y, **kwargs): train_file = self._prepare_train_data(X, y) scores = train_file[:, 0] queries = train_file[:, 1] - features = train_file[:, 3:] + features = train_file[:, 2:] model_preds = np.zeros(len(features)) @@ -203,7 +203,7 @@ def fit(self, X, y, **kwargs): model_preds += self.learning_rate * prediction #TODO: Remove the next two statements after debugging #train_score = self._score(model_preds, scores, queries, 10) - #print(" --iteration train score " + str(train_score)) + #print(" --iteration train score " + str(train_score), X.shape, " and ", y.shape) #return self.ensemble def _predict_scores_fixed(self, X, **kwargs): From bb4c064432ad3276b4a44ba9a3247d92c81471bf Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Mon, 17 Feb 2020 00:13:46 +0100 Subject: [PATCH 09/11] Run the black formatter on lambdamart file --- csrank/objectranking/lambdamart.py | 172 ++++++++++++++++++----------- 1 file changed, 106 insertions(+), 66 deletions(-) diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py index bdb9214a..b4c817a5 100644 --- a/csrank/objectranking/lambdamart.py +++ b/csrank/objectranking/lambdamart.py @@ -9,11 +9,28 @@ from csrank.learner import Learner from csrank.objectranking.object_ranker import ObjectRanker -class LambdaMART(ObjectRanker,Learner): - def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, learning_rate=1e-3, - min_samples_split=2, max_depth=50, min_samples_leaf=1, max_leaf_nodes=None, num_process = None, - criterion="mse", splitter="best", min_weight_fraction_leaf=0.0, max_features=None, random_state=None, - min_impurity_decrease=0.0, min_impurity_split=1e-7, **kwargs): + +class LambdaMART(ObjectRanker, Learner): + def __init__( + self, + n_objects=None, + n_object_features=None, + number_of_trees=5, + learning_rate=1e-3, + min_samples_split=2, + max_depth=50, + min_samples_leaf=1, + max_leaf_nodes=None, + num_process=None, + criterion="mse", + splitter="best", + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + min_impurity_decrease=0.0, + min_impurity_split=1e-7, + **kwargs + ): """ Create a LambdaMART based rank regression model. This model uses an ensemble of trees that learn to predict the relevance scores of @@ -62,8 +79,8 @@ def __init__(self, n_objects=None, n_object_features=None, number_of_trees=5, le self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.logger = logging.getLogger(LambdaMART.__name__) - #TODO: Used for Debugging, remove for production - #print("LambdaMART init 2") + # TODO: Used for Debugging, remove for production + # print("LambdaMART init 2") def _prepare_train_data(self, X, Y, **kwargs): """ @@ -86,46 +103,46 @@ def _prepare_train_data(self, X, Y, **kwargs): features and relevance scores derived from the ranking provided in y_train """ - #prepare array like features and imaginary qids + # prepare array like features and imaginary qids xdim = X.shape[0] # n_instances - qid ydim = X.shape[1] # n_objects - documents zdim = X.shape[2] # n_features features_as_list = deque() - for i in range(0,xdim): - for j in range(0,ydim): - row_as_list=deque([i]) + for i in range(0, xdim): + for j in range(0, ydim): + row_as_list = deque([i]) features = deque() for k in range(0, zdim): row_as_list.append(X[i, j, k]) features_as_list.append(row_as_list) - #Convert rankings to relevance scores + # Convert rankings to relevance scores scores_docsize = Y.shape[1] relscore_train = np.subtract(scores_docsize, Y) - #prepare array like relevance score values + # prepare array like relevance score values xdim_scores = relscore_train.shape[0] ydim_scores = relscore_train.shape[1] scores_as_list = deque() - for x in range(0,xdim_scores): - for y in range(0,ydim_scores): - scores_as_list.append(relscore_train[x,y]) - - #Check if both the dimensions are the same - assert(len(features_as_list)==len(scores_as_list)) - - #convert to numpy and resize the arrays + for x in range(0, xdim_scores): + for y in range(0, ydim_scores): + scores_as_list.append(relscore_train[x, y]) + + # Check if both the dimensions are the same + assert len(features_as_list) == len(scores_as_list) + + # convert to numpy and resize the arrays features = np.asarray(features_as_list) scores_unflat = np.array(scores_as_list) - scores = np.reshape(scores_unflat,(len(scores_unflat),1)) + scores = np.reshape(scores_unflat, (len(scores_unflat), 1)) - #Concatenate the reshaped arrays and return as trainin data - train_data = np.concatenate((scores,features),axis=1) + # Concatenate the reshaped arrays and return as trainin data + train_data = np.concatenate((scores, features), axis=1) return train_data - + def _group_by_queries(self, data, queries): """ Internal function which orders the data given as input based @@ -140,7 +157,7 @@ def _group_by_queries(self, data, queries): result[-1].append(s) result = list(map(np.array, result)) return result - + def fit(self, X, y, **kwargs): """ Fit a LambdaMART algorithm to the provided X and y arrays where X @@ -163,48 +180,52 @@ def fit(self, X, y, **kwargs): make up the MART model """ - #check the case if the ensemble already has some trees then clear the trees + # check the case if the ensemble already has some trees then clear the trees # so that the trees from the previous iteration are not used. if len(self.ensemble) > 0: self.ensemble.clear() - + train_file = self._prepare_train_data(X, y) scores = train_file[:, 0] queries = train_file[:, 1] features = train_file[:, 2:] model_preds = np.zeros(len(features)) - + for i in range(self.number_of_trees): - #print(" Iteration: " + str(i + 1)) + # print(" Iteration: " + str(i + 1)) true_data = self._group_by_queries(scores, queries) model_data = self._group_by_queries(model_preds, queries) with Pool(self.num_process) as pool: - lambdas_draft = pool.map(query_lambdas, list(zip(true_data, model_data))) + lambdas_draft = pool.map( + query_lambdas, list(zip(true_data, model_data)) + ) lambdas = list(chain(*lambdas_draft)) - tree = DecisionTreeRegressor(criterion=self.criterion, - splitter=self.splitter, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - max_features=None, - random_state=self.random_state, - max_leaf_nodes=self.max_leaf_nodes, - min_impurity_decrease=self.min_impurity_decrease, - min_impurity_split=self.min_impurity_split) + tree = DecisionTreeRegressor( + criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=None, + random_state=self.random_state, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + min_impurity_split=self.min_impurity_split, + ) tree.fit(features, lambdas) self.ensemble.append(tree) prediction = tree.predict(features) model_preds += self.learning_rate * prediction - #TODO: Remove the next two statements after debugging - #train_score = self._score(model_preds, scores, queries, 10) - #print(" --iteration train score " + str(train_score), X.shape, " and ", y.shape) - #return self.ensemble + # TODO: Remove the next two statements after debugging + # train_score = self._score(model_preds, scores, queries, 10) + # print(" --iteration train score " + str(train_score), X.shape, " and ", y.shape) + # return self.ensemble def _predict_scores_fixed(self, X, **kwargs): """ @@ -221,7 +242,9 @@ def _predict_scores_fixed(self, X, **kwargs): Returns the scores of each of the objects for each of the samples. """ n_instances, n_objects, n_features = X.shape - self.logger.info("For Test instances {} objects {} features {}".format(*X.shape)) + self.logger.info( + "For Test instances {} objects {} features {}".format(*X.shape) + ) X1 = X.reshape(n_instances * n_objects, n_features) scores = np.zeros(n_instances * n_objects) for tree in self.ensemble: @@ -268,10 +291,10 @@ def _predict(self, pred_vector): ------- results: Predicted scores for each of the objects queries: queries corresponding to the predictions that are made - """ + """ queries = pred_vector[:, 1] - features = pred_vector[:, 2:] - + features = pred_vector[:, 2:] + results = np.zeros(len(features)) for tree in self.ensemble: results += tree.predict(features) * self.learning_rate @@ -304,9 +327,23 @@ def _score(self, prediction, true_score, query, k=10): return sum(total_ndcg) / len(total_ndcg) - def set_tunable_parameters(self, min_samples_split=2, max_depth=50, min_samples_leaf=1, max_leaf_nodes=None, - learning_rate=1e-3, number_of_trees=5, criterion="mse", splitter="best", min_weight_fraction_leaf=0.0, - max_features=None, random_state=None, min_impurity_decrease=0.0, min_impurity_split=1e-7, **kwargs): + def set_tunable_parameters( + self, + min_samples_split=2, + max_depth=50, + min_samples_leaf=1, + max_leaf_nodes=None, + learning_rate=1e-3, + number_of_trees=5, + criterion="mse", + splitter="best", + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + min_impurity_decrease=0.0, + min_impurity_split=1e-7, + **kwargs + ): """ Set the tunable hyperparameters of the DecisionTree model used in LambdaMART @@ -361,7 +398,6 @@ def query_lambdas(data, k=10): true_data = true_data[worst_order] model_data = model_data[worst_order] - model_order = np.argsort(model_data) idcg = dcg(np.sort(true_data)[-10:][::-1]) @@ -371,28 +407,30 @@ def query_lambdas(data, k=10): for i in range(size): for j in range(size): - position_score[model_order[i], model_order[j]] = \ - point_dcg((model_order[j], true_data[model_order[i]])) + position_score[model_order[i], model_order[j]] = point_dcg( + (model_order[j], true_data[model_order[i]]) + ) lambdas = np.zeros(size) for i in range(size): for j in range(size): - if true_data[i] > true_data[j]: + if true_data[i] > true_data[j]: - delta_dcg = position_score[i][j] - position_score[i][i] - delta_dcg += position_score[j][i] - position_score[j][j] + delta_dcg = position_score[i][j] - position_score[i][i] + delta_dcg += position_score[j][i] - position_score[j][j] - delta_ndcg = abs(delta_dcg / idcg) + delta_ndcg = abs(delta_dcg / idcg) - rho = 1 / (1 + math.exp(model_data[i] - model_data[j])) + rho = 1 / (1 + math.exp(model_data[i] - model_data[j])) - lam = rho * delta_ndcg + lam = rho * delta_ndcg - lambdas[j] -= lam - lambdas[i] += lam + lambdas[j] -= lam + lambdas[i] += lam return lambdas + def point_dcg(args): """ Point DCG calculation function. Calculates the DCG for a given list. This @@ -402,6 +440,7 @@ def point_dcg(args): pos, label = args return (2 ** label - 1) / np.log2(pos + 2) + def dcg(preds): """ List DCG calculation function. This function turns the list of rankings @@ -409,6 +448,7 @@ def dcg(preds): """ return sum(map(point_dcg, enumerate(preds))) + def ndcg(preds, k=10): """ NDCG calculation function that calculates the NDCG values with the help @@ -423,11 +463,11 @@ def ndcg(preds, k=10): else: true_top = np.sort(preds) true_top = true_top[::-1] - + max_dcg = dcg(true_top) ideal_dcg = dcg(ideal_top) if max_dcg == 0: return 1 - return ideal_dcg / max_dcg \ No newline at end of file + return ideal_dcg / max_dcg From df665011ea3e6a0372c2b58630aa407b2c7d4887 Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Mon, 17 Feb 2020 00:19:17 +0100 Subject: [PATCH 10/11] Some final touches --- csrank/objectranking/lambdamart.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/csrank/objectranking/lambdamart.py b/csrank/objectranking/lambdamart.py index b4c817a5..c58f5026 100644 --- a/csrank/objectranking/lambdamart.py +++ b/csrank/objectranking/lambdamart.py @@ -79,8 +79,6 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.logger = logging.getLogger(LambdaMART.__name__) - # TODO: Used for Debugging, remove for production - # print("LambdaMART init 2") def _prepare_train_data(self, X, Y, **kwargs): """ @@ -193,7 +191,6 @@ def fit(self, X, y, **kwargs): model_preds = np.zeros(len(features)) for i in range(self.number_of_trees): - # print(" Iteration: " + str(i + 1)) true_data = self._group_by_queries(scores, queries) model_data = self._group_by_queries(model_preds, queries) @@ -222,10 +219,8 @@ def fit(self, X, y, **kwargs): prediction = tree.predict(features) model_preds += self.learning_rate * prediction - # TODO: Remove the next two statements after debugging # train_score = self._score(model_preds, scores, queries, 10) - # print(" --iteration train score " + str(train_score), X.shape, " and ", y.shape) - # return self.ensemble + # print("iteration"+ i +" train score " + str(train_score)+" "+str(X.shape) + " and "+ str(y.shape)) def _predict_scores_fixed(self, X, **kwargs): """ From d8a4e6e27254b666adbe45bc16c5069608136b5f Mon Sep 17 00:00:00 2001 From: srinikrish22 Date: Mon, 17 Feb 2020 00:30:50 +0100 Subject: [PATCH 11/11] Made changes to testcase --- csrank/tests/test_ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrank/tests/test_ranking.py b/csrank/tests/test_ranking.py index e1d8f7fd..78860e65 100644 --- a/csrank/tests/test_ranking.py +++ b/csrank/tests/test_ranking.py @@ -17,7 +17,7 @@ FATELINEAR_RANKER: (FATELinearObjectRanker, {"n_hidden_set_units": 12, "batch_size": 1}, (0.0, 1.0)), FETALINEAR_RANKER: (FETALinearObjectRanker, {}, (0.0, 1.0)), LAMBDAMART: (LambdaMART, {"min_samples_split": 2, "max_depth": 50, "min_samples_leaf": 1, - "max_leaf_nodes": 10}, (0.66, 0.0)), + "max_leaf_nodes": 10}, (0.4, 0.0)), FETA_RANKER: (FETAObjectRanker, {"add_zeroth_order_model": True, "optimizer": optimizer}, (0.0, 1.0)), RANKNET: (RankNet, {"optimizer": optimizer}, (0.0, 1.0)), CMPNET: (CmpNet, {"optimizer": optimizer}, (0.0, 1.0)),