From 01f179e9587897b74e27e1abb69dfd75aa321c45 Mon Sep 17 00:00:00 2001 From: shenkha Date: Mon, 14 Jul 2025 18:33:45 +0400 Subject: [PATCH 01/20] feat(linear): Add ensemble tree model and solver-aware scoring --- libmultilabel/linear/tree.py | 111 +++++++++++++++++++++++++++++++---- linear_trainer.py | 28 ++++++--- main.py | 3 + 3 files changed, 122 insertions(+), 20 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index fe6e94b4..fecb1f45 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -6,12 +6,14 @@ import scipy.sparse as sparse import sklearn.cluster import sklearn.preprocessing +import sklearn.utils from tqdm import tqdm import psutil - from . import linear +from scipy.special import log_expit +#from sparsekmeans import LloydKmeans, ElkanKmeans -__all__ = ["train_tree", "TreeModel"] +__all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"] class Node: @@ -47,20 +49,39 @@ def __init__( root: Node, flat_model: linear.FlatModel, node_ptr: np.ndarray, + options: str, ): self.name = "tree" self.root = root self.flat_model = flat_model self.node_ptr = node_ptr + self.options = options self.multiclass = False self._model_separated = False # Indicates whether the model has been separated for pruning tree. + def _is_lr(self) -> bool: + options = self.options or "" + options_split = options.split() + if "-s" in options_split: + i = options_split.index("-s") + if i + 1 < len(options_split): + solver_type = options_split[i + 1] + return solver_type in ["0", "6", "7"] + return False + + def _get_scores(self, pred, parent_score=0.0): + if self._is_lr(): + #return parent_score - np.log(1 + np.exp(-pred)) + return parent_score + log_expit(pred) + else: + return parent_score - np.square(np.maximum(0, 1 - pred)) + def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, ) -> np.ndarray: - """Calculate the probability estimates associated with x. + """Calculates the probability estimates associated with x. Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. @@ -109,17 +130,14 @@ def _separate_model_for_pruning_tree(self): **tree_flat_model_params ) self.subtree_models.append(subtree_flatmodel) - + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. - Only subtrees corresponding to the top beam_width candidates from the root are evaluated, skipping the rest to avoid unnecessary computation. - Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. - Returns: np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels). 
""" @@ -129,7 +147,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) + children_scores = self._get_scores(root_preds) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -145,6 +163,8 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) for subtree_idx in range(len(self.root.children)): subtree_model = self.subtree_models[subtree_idx] instances_mask = mask[:, subtree_idx] + if not np.any(instances_mask): + continue reduced_instances = x[np.s_[instances_mask], :] # Locate the position of the subtree root in the weight mapping of all nodes @@ -179,18 +199,18 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - np.square(np.maximum(0, 1 - pred)) + children_score = self._get_scores(pred, score) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] next_level = [] num_labels = len(self.root.label_map) - scores = np.zeros(num_labels) + scores = np.full(num_labels, 0.0) for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) + scores[node.label_map] = np.exp(self._get_scores(pred, score)) return scores @@ -258,7 +278,7 @@ def visit(node): pbar.close() flat_model, node_ptr = _flatten_model(root) - return TreeModel(root, flat_model, node_ptr) + return TreeModel(root, flat_model, node_ptr, options) def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: @@ -382,3 +402,70 @@ def visit(node): node_ptr = np.cumsum([0] + list(map(lambda w: w.shape[1], weights))) return model, node_ptr + + +class EnsembleTreeModel: + """An ensemble of tree models. + The ensemble aggregates predictions from multiple trees to improve accuracy and robustness. + """ + + def __init__(self, tree_models: list[TreeModel]): + """ + Args: + tree_models (list[TreeModel]): A list of trained tree models. + """ + self.name = "ensemble-tree" + self.tree_models = tree_models + self.multiclass = False + + def predict_values(self, x: sparse.csr_matrix, beam_width: int = 10) -> np.ndarray: + """Calculates the averaged probability estimates from all trees in the ensemble. + + Args: + x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. + beam_width (int, optional): Number of candidates considered during beam search for each tree. Defaults to 10. + + Returns: + np.ndarray: A matrix with dimension number of instances * number of classes, containing averaged scores. + """ + all_predictions = [model.predict_values(x, beam_width) for model in self.tree_models] + return np.mean(all_predictions, axis=0) + + +def train_ensemble_tree( + y: sparse.csr_matrix, + x: sparse.csr_matrix, + options: str = "", + K: int = 100, + dmax: int = 10, + n_trees: int = 3, + seed: int = 42, + verbose: bool = True, +) -> EnsembleTreeModel: + """Trains an ensemble of tree models (Parabel/Bonsai-style). 
+ Args: + y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. + x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. + options (str, optional): The option string passed to liblinear. Defaults to ''. + K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. + dmax (int, optional): Maximum depth of the tree. Defaults to 10. + n_trees (int, optional): Number of trees in the ensemble. Defaults to 3. + seed (int, optional): The base random seed for the ensemble. Defaults to 42. + verbose (bool, optional): Output extra progress information. Defaults to True. + + Returns: + EnsembleTreeModel: An ensemble model which can be used for prediction. + """ + tree_models = [] + for i in range(n_trees): + np.random.seed(seed + i) + + tree_model = train_tree(y, x, options, K, dmax, verbose=False) + tree_models.append(tree_model) + + + + if verbose: + print("Ensemble training completed.") + + return EnsembleTreeModel(tree_models) \ No newline at end of file diff --git a/linear_trainer.py b/linear_trainer.py index b0524ee7..a664945b 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -6,6 +6,7 @@ import libmultilabel.linear as linear from libmultilabel.common_utils import dump_log, is_multiclass_dataset +from libmultilabel.linear.tree import train_ensemble_tree from libmultilabel.linear.utils import LINEAR_TECHNIQUES @@ -21,7 +22,7 @@ def linear_test(config, model, datasets, label_mapping): scores = [] predict_kwargs = {} - if model.name == "tree": + if model.name == "tree" or model.name == "ensemble-tree": predict_kwargs["beam_width"] = config.beam_width for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): @@ -48,13 +49,24 @@ def linear_train(datasets, config): if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") - model = LINEAR_TECHNIQUES[config.linear_technique]( - datasets["train"]["y"], - datasets["train"]["x"], - options=config.liblinear_options, - K=config.tree_degree, - dmax=config.tree_max_depth, - ) + if config.tree_ensemble_models > 1: + model = train_ensemble_tree( + datasets["train"]["y"], + datasets["train"]["x"], + options=config.liblinear_options, + K=config.tree_degree, + dmax=config.tree_max_depth, + n_trees=config.tree_ensemble_models, + seed=config.seed if config.seed is not None else 42, + ) + else: + model = LINEAR_TECHNIQUES[config.linear_technique]( + datasets["train"]["y"], + datasets["train"]["x"], + options=config.liblinear_options, + K=config.tree_degree, + dmax=config.tree_max_depth, + ) else: model = LINEAR_TECHNIQUES[config.linear_technique]( datasets["train"]["y"], diff --git a/main.py b/main.py index 12564f6b..86212980 100644 --- a/main.py +++ b/main.py @@ -223,6 +223,9 @@ def add_all_arguments(parser): parser.add_argument( "--tree_max_depth", type=int, default=10, help="Maximum depth of the tree (default: %(default)s)" ) + parser.add_argument( + "--tree_ensemble_models", type=int, default=1, help="Number of models in the tree ensemble (default: %(default)s)" + ) parser.add_argument( "--beam_width", type=int, From 3c13e37991c70bfcfd0e2528a4ed96dbe8aaa013 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 16 Jul 2025 20:43:33 +0400 Subject: [PATCH 02/20] revert unnecessary changes --- libmultilabel/linear/tree.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index fecb1f45..ba7d6a7f 100644 --- a/libmultilabel/linear/tree.py +++ 
b/libmultilabel/linear/tree.py @@ -9,9 +9,10 @@ import sklearn.utils from tqdm import tqdm import psutil + from . import linear from scipy.special import log_expit -#from sparsekmeans import LloydKmeans, ElkanKmeans + __all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"] @@ -71,7 +72,6 @@ def _is_lr(self) -> bool: def _get_scores(self, pred, parent_score=0.0): if self._is_lr(): - #return parent_score - np.log(1 + np.exp(-pred)) return parent_score + log_expit(pred) else: return parent_score - np.square(np.maximum(0, 1 - pred)) @@ -133,11 +133,14 @@ def _separate_model_for_pruning_tree(self): def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. + Only subtrees corresponding to the top beam_width candidates from the root are evaluated, skipping the rest to avoid unnecessary computation. + Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. + Returns: np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels). """ @@ -206,7 +209,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra next_level = [] num_labels = len(self.root.label_map) - scores = np.full(num_labels, 0.0) + scores = np.zeros(num_labels) for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] From b363dbdad569918f29dfd11cf9be896b50d5b3b9 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 16 Jul 2025 20:49:37 +0400 Subject: [PATCH 03/20] revert small changes --- libmultilabel/linear/tree.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index ba7d6a7f..064dc41b 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -81,7 +81,7 @@ def predict_values( x: sparse.csr_matrix, beam_width: int = 10, ) -> np.ndarray: - """Calculates the probability estimates associated with x. + """Calculate the probability estimates associated with x. Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. 
@@ -166,8 +166,6 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) for subtree_idx in range(len(self.root.children)): subtree_model = self.subtree_models[subtree_idx] instances_mask = mask[:, subtree_idx] - if not np.any(instances_mask): - continue reduced_instances = x[np.s_[instances_mask], :] # Locate the position of the subtree root in the weight mapping of all nodes From 024b5444a222e23d4a1d83cc839fcbf004d2af03 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 16 Jul 2025 20:51:10 +0400 Subject: [PATCH 04/20] remove unnecessary import --- libmultilabel/linear/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 064dc41b..66e9e48d 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -6,7 +6,7 @@ import scipy.sparse as sparse import sklearn.cluster import sklearn.preprocessing -import sklearn.utils + from tqdm import tqdm import psutil From 6335dfec969f1217cdcab1d96dc9f20da1c4bb76 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 16 Jul 2025 20:52:15 +0400 Subject: [PATCH 05/20] revert small change --- libmultilabel/linear/tree.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 66e9e48d..9590c3ff 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -6,7 +6,6 @@ import scipy.sparse as sparse import sklearn.cluster import sklearn.preprocessing - from tqdm import tqdm import psutil From 3016397e6a17d154231fb4696cca6967a1ccee83 Mon Sep 17 00:00:00 2001 From: shenkha Date: Thu, 17 Jul 2025 16:25:19 +0400 Subject: [PATCH 06/20] applied black --- libmultilabel/linear/tree.py | 38 ++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 9590c3ff..3aecb67c 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -57,7 +57,7 @@ def __init__( self.node_ptr = node_ptr self.options = options self.multiclass = False - self._model_separated = False # Indicates whether the model has been separated for pruning tree. + self._model_separated = False # Indicates whether the model has been separated for pruning tree. def _is_lr(self) -> bool: options = self.options or "" @@ -92,13 +92,17 @@ def predict_values( if beam_width >= len(self.root.children): # Beam_width is sufficiently large; pruning not applied. # Calculates decision values for all nodes. - all_preds = linear.predict_values(self.flat_model, x) # number of instances * (number of labels + total number of metalabels) + all_preds = linear.predict_values( + self.flat_model, x + ) # number of instances * (number of labels + total number of metalabels) else: # Beam_width is small; pruning applied to reduce computation. 
if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels) + all_preds = self._prune_tree_and_predict_values( + x, beam_width + ) # number of instances * (number of labels + total number of metalabels) return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): @@ -106,27 +110,21 @@ def _separate_model_for_pruning_tree(self): This function separates the weights for the root node and its children into (K+1) FlatModel for efficient beam search traversal in Python. """ - tree_flat_model_params = { - 'bias': self.root.model.bias, - 'thresholds': 0, - 'multiclass': False - } + tree_flat_model_params = {"bias": self.root.model.bias, "thresholds": 0, "multiclass": False} slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] self.root_model = linear.FlatModel( - name="root-flattened-tree", - weights=self.flat_model.weights[slice].tocsr(), - **tree_flat_model_params + name="root-flattened-tree", weights=self.flat_model.weights[slice].tocsr(), **tree_flat_model_params ) self.subtree_models = [] for i in range(len(self.root.children)): subtree_weights_start = self.node_ptr[self.root.children[i].index] - subtree_weights_end = self.node_ptr[self.root.children[i+1].index] if i+1 < len(self.root.children) else -1 + subtree_weights_end = ( + self.node_ptr[self.root.children[i + 1].index] if i + 1 < len(self.root.children) else -1 + ) slice = np.s_[:, subtree_weights_start:subtree_weights_end] subtree_flatmodel = linear.FlatModel( - name="subtree-flattened-tree", - weights=self.flat_model.weights[slice].tocsr(), - **tree_flat_model_params + name="subtree-flattened-tree", weights=self.flat_model.weights[slice].tocsr(), **tree_flat_model_params ) self.subtree_models.append(subtree_flatmodel) @@ -135,7 +133,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) Only subtrees corresponding to the top beam_width candidates from the root are evaluated, skipping the rest to avoid unnecessary computation. - + Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. 
@@ -160,7 +158,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
         # Build a mask where mask[i, j] is True if the j-th subtree is among the top beam_width subtrees for the i-th instance
         mask = np.zeros_like(children_scores, dtype=np.bool_)
         np.put_along_axis(mask, top_beam_width_indices, True, axis=1)
-        
+
         # Calculate predictions for each subtree with its corresponding instances
         for subtree_idx in range(len(self.root.children)):
             subtree_model = self.subtree_models[subtree_idx]
@@ -459,13 +457,11 @@ def train_ensemble_tree(
     tree_models = []
     for i in range(n_trees):
         np.random.seed(seed + i)
-        
+
         tree_model = train_tree(y, x, options, K, dmax, verbose=False)
         tree_models.append(tree_model)
 
-        
-        
     if verbose:
         print("Ensemble training completed.")
 
-    return EnsembleTreeModel(tree_models)
\ No newline at end of file
+    return EnsembleTreeModel(tree_models)

From 85c2e196c05684f88975455792dbcf837b4e7a42 Mon Sep 17 00:00:00 2001
From: shenkha
Date: Mon, 21 Jul 2025 17:30:23 +0400
Subject: [PATCH 07/20] add type hints to _get_scores

---
 libmultilabel/linear/tree.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 3aecb67c..017e28fc 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -69,7 +69,7 @@ def _is_lr(self) -> bool:
                 return solver_type in ["0", "6", "7"]
         return False
 
-    def _get_scores(self, pred, parent_score=0.0):
+    def _get_scores(self, pred: np.ndarray, parent_score: float = 0.0) -> np.ndarray:
         if self._is_lr():
             return parent_score + log_expit(pred)
         else:
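For reference, the solver-aware scoring introduced in PATCH 01 and annotated in PATCH 07 (and removed again in PATCH 15 below) reduces to the following standalone sketch. It is illustrative only and assumes NumPy and SciPy; here `is_lr` stands in for `TreeModel._is_lr()`, which checks whether the liblinear `-s` option selects a logistic-regression solver (0, 6, or 7).

import numpy as np
from scipy.special import log_expit

def get_scores(pred: np.ndarray, parent_score: float = 0.0, is_lr: bool = False) -> np.ndarray:
    # Illustrative sketch of TreeModel._get_scores: accumulate a node's
    # beam-search score from its parent's score.
    if is_lr:
        # LR solvers (-s 0/6/7): decision values are log-odds, so
        # log_expit(pred) = log(sigmoid(pred)) accumulates log-probabilities.
        return parent_score + log_expit(pred)
    # L2-loss SVM solvers: negated squared hinge, -max(0, 1 - pred)^2,
    # so predictions beyond the margin (pred >= 1) cost nothing.
    return parent_score - np.square(np.maximum(0, 1 - pred))

pred = np.array([3.0, -0.5])
print(get_scores(pred, is_lr=True))   # approx. [-0.0486 -0.9741]
print(get_scores(pred, is_lr=False))  # [ 0.   -2.25]

Both branches are monotone in the decision value, so either ranks children consistently during beam search; the log-sigmoid form additionally keeps the accumulated scores interpretable as log-probabilities under LR solvers.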
From ab9cde0f558aa25f31491cca9104794326e90e07 Mon Sep 17 00:00:00 2001
From: shenkha
Date: Wed, 23 Jul 2025 11:34:06 +0400
Subject: [PATCH 15/20] keep only the ensemble implementation and remove solver-aware scoring

---
 libmultilabel/linear/tree.py | 60 +++++++++++++-----------------
 1 file changed, 21 insertions(+), 39 deletions(-)

diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 017e28fc..d630ff24 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -10,8 +10,6 @@
 import psutil
 
 from . import linear
-from scipy.special import log_expit
-
 
 __all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"]
 
@@ -49,31 +47,13 @@ def __init__(
         root: Node,
         flat_model: linear.FlatModel,
         node_ptr: np.ndarray,
-        options: str,
     ):
         self.name = "tree"
         self.root = root
         self.flat_model = flat_model
         self.node_ptr = node_ptr
-        self.options = options
         self.multiclass = False
-        self._model_separated = False  # Indicates whether the model has been separated for pruning tree.
-
-    def _is_lr(self) -> bool:
-        options = self.options or ""
-        options_split = options.split()
-        if "-s" in options_split:
-            i = options_split.index("-s")
-            if i + 1 < len(options_split):
-                solver_type = options_split[i + 1]
-                return solver_type in ["0", "6", "7"]
-        return False
-
-    def _get_scores(self, pred: np.ndarray, parent_score: float = 0.0) -> np.ndarray:
-        if self._is_lr():
-            return parent_score + log_expit(pred)
-        else:
-            return parent_score - np.square(np.maximum(0, 1 - pred))
+        self._model_separated = False  # Indicates whether the model has been separated for pruning tree.
 
     def predict_values(
         self,
         x: sparse.csr_matrix,
         beam_width: int = 10,
     ) -> np.ndarray:
@@ -92,17 +72,13 @@ def predict_values(
         if beam_width >= len(self.root.children):
             # Beam_width is sufficiently large; pruning not applied.
             # Calculates decision values for all nodes.
- all_preds = linear.predict_values( - self.flat_model, x - ) # number of instances * (number of labels + total number of metalabels) + all_preds = linear.predict_values(self.flat_model, x) # number of instances * (number of labels + total number of metalabels) else: # Beam_width is small; pruning applied to reduce computation. if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values( - x, beam_width - ) # number of instances * (number of labels + total number of metalabels) + all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels) return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): @@ -110,24 +86,30 @@ def _separate_model_for_pruning_tree(self): This function separates the weights for the root node and its children into (K+1) FlatModel for efficient beam search traversal in Python. """ - tree_flat_model_params = {"bias": self.root.model.bias, "thresholds": 0, "multiclass": False} + tree_flat_model_params = { + 'bias': self.root.model.bias, + 'thresholds': 0, + 'multiclass': False + } slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] self.root_model = linear.FlatModel( - name="root-flattened-tree", weights=self.flat_model.weights[slice].tocsr(), **tree_flat_model_params + name="root-flattened-tree", + weights=self.flat_model.weights[slice].tocsr(), + **tree_flat_model_params ) self.subtree_models = [] for i in range(len(self.root.children)): subtree_weights_start = self.node_ptr[self.root.children[i].index] - subtree_weights_end = ( - self.node_ptr[self.root.children[i + 1].index] if i + 1 < len(self.root.children) else -1 - ) + subtree_weights_end = self.node_ptr[self.root.children[i+1].index] if i+1 < len(self.root.children) else -1 slice = np.s_[:, subtree_weights_start:subtree_weights_end] subtree_flatmodel = linear.FlatModel( - name="subtree-flattened-tree", weights=self.flat_model.weights[slice].tocsr(), **tree_flat_model_params + name="subtree-flattened-tree", + weights=self.flat_model.weights[slice].tocsr(), + **tree_flat_model_params ) self.subtree_models.append(subtree_flatmodel) - + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. 
@@ -147,7 +129,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - children_scores = self._get_scores(root_preds) + children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -158,7 +140,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Build a mask where mask[i, j] is True if the j-th subtree is among the top beam_width subtrees for the i-th instance mask = np.zeros_like(children_scores, dtype=np.bool_) np.put_along_axis(mask, top_beam_width_indices, True, axis=1) - + # Calculate predictions for each subtree with its corresponding instances for subtree_idx in range(len(self.root.children)): subtree_model = self.subtree_models[subtree_idx] @@ -197,7 +179,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = self._get_scores(pred, score) + children_score = score - np.square(np.maximum(0, 1 - pred)) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -208,7 +190,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(self._get_scores(pred, score)) + scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) return scores @@ -276,7 +258,7 @@ def visit(node): pbar.close() flat_model, node_ptr = _flatten_model(root) - return TreeModel(root, flat_model, node_ptr, options) + return TreeModel(root, flat_model, node_ptr) def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: From 5419063986aa441a50e33830652c4a23f2557988 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 23 Jul 2025 13:54:09 +0400 Subject: [PATCH 16/20] resolve SW's comment --- linear_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linear_trainer.py b/linear_trainer.py index a664945b..8fbf699a 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -6,7 +6,7 @@ import libmultilabel.linear as linear from libmultilabel.common_utils import dump_log, is_multiclass_dataset -from libmultilabel.linear.tree import train_ensemble_tree +from libmultilabel.linear.tree import EnsembleTreeModel, TreeModel, train_ensemble_tree from libmultilabel.linear.utils import LINEAR_TECHNIQUES @@ -22,7 +22,7 @@ def linear_test(config, model, datasets, label_mapping): scores = [] predict_kwargs = {} - if model.name == "tree" or model.name == "ensemble-tree": + if isinstance(model, (TreeModel, EnsembleTreeModel)): predict_kwargs["beam_width"] = config.beam_width for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): From 24f9a6be5be029d6b9a385615e59f2c1737b8a7d Mon Sep 17 00:00:00 2001 From: shenkha Date: Sun, 3 Aug 2025 17:18:24 +0400 Subject: [PATCH 17/20] making dmax and K as global variables --- libmultilabel/linear/tree.py | 23 ++++++++--------------- linear_trainer.py | 7 ++----- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 
d630ff24..98f95ceb 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -13,6 +13,9 @@ __all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"] +K = 100 +DMAX = 10 + class Node: def __init__( @@ -198,8 +201,6 @@ def train_tree( y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", - K=100, - dmax=10, verbose: bool = True, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. @@ -209,8 +210,6 @@ def train_tree( y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str): The option string passed to liblinear. - K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. - dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. Returns: @@ -218,7 +217,7 @@ def train_tree( """ label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0) root.is_root = True num_nodes = 0 @@ -261,20 +260,18 @@ def visit(node): return TreeModel(root, flat_model, node_ptr) -def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: +def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int) -> Node: """Build the tree recursively by kmeans clustering. Args: label_representation (sparse.csr_matrix): A matrix with dimensions number of classes under this node * number of features. label_map (np.ndarray): Maps 0..label_representation.shape[0] to the original label indices. d (int): Current depth. - K (int): Maximum degree of nodes in the tree. - dmax (int): Maximum depth of the tree. Returns: Node: Root of the (sub)tree built from label_representation. """ - if d >= dmax or label_representation.shape[0] <= K: + if d >= DMAX or label_representation.shape[0] <= K: return Node(label_map=label_map, children=[]) metalabels = ( @@ -294,7 +291,7 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, for i in range(K): child_representation = label_representation[metalabels == i] child_map = label_map[metalabels == i] - child = _build_tree(child_representation, child_map, d + 1, K, dmax) + child = _build_tree(child_representation, child_map, d + 1) children.append(child) return Node(label_map=label_map, children=children) @@ -416,8 +413,6 @@ def train_ensemble_tree( y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", - K: int = 100, - dmax: int = 10, n_trees: int = 3, seed: int = 42, verbose: bool = True, @@ -427,8 +422,6 @@ def train_ensemble_tree( y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str, optional): The option string passed to liblinear. Defaults to ''. - K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. - dmax (int, optional): Maximum depth of the tree. Defaults to 10. n_trees (int, optional): Number of trees in the ensemble. Defaults to 3. seed (int, optional): The base random seed for the ensemble. Defaults to 42. 
verbose (bool, optional): Output extra progress information. Defaults to True. @@ -440,7 +433,7 @@ def train_ensemble_tree( for i in range(n_trees): np.random.seed(seed + i) - tree_model = train_tree(y, x, options, K, dmax, verbose=False) + tree_model = train_tree(y, x, options, verbose=False) tree_models.append(tree_model) if verbose: diff --git a/linear_trainer.py b/linear_trainer.py index 8fbf699a..f8a46a2f 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -48,14 +48,13 @@ def linear_train(datasets, config): if config.linear_technique == "tree": if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") - + linear.tree.K = config.tree_degree + linear.tree.DMAX = config.tree_max_depth if config.tree_ensemble_models > 1: model = train_ensemble_tree( datasets["train"]["y"], datasets["train"]["x"], options=config.liblinear_options, - K=config.tree_degree, - dmax=config.tree_max_depth, n_trees=config.tree_ensemble_models, seed=config.seed if config.seed is not None else 42, ) @@ -64,8 +63,6 @@ def linear_train(datasets, config): datasets["train"]["y"], datasets["train"]["x"], options=config.liblinear_options, - K=config.tree_degree, - dmax=config.tree_max_depth, ) else: model = LINEAR_TECHNIQUES[config.linear_technique]( From 0669c9fce4fd1b45c636610a18aeb7968c92b96b Mon Sep 17 00:00:00 2001 From: shenkha Date: Mon, 4 Aug 2025 11:21:09 +0400 Subject: [PATCH 18/20] making dmax and K as global default value --- libmultilabel/linear/tree.py | 27 ++++++++++++++++++--------- linear_trainer.py | 7 +++++-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 98f95ceb..f1fa7ec2 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -13,8 +13,8 @@ __all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"] -K = 100 -DMAX = 10 +DEFAULT_K = 100 +DEFAULT_DMAX = 10 class Node: @@ -201,6 +201,8 @@ def train_tree( y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", + K=DEFAULT_K, + dmax=DEFAULT_DMAX, verbose: bool = True, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. @@ -210,6 +212,8 @@ def train_tree( y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str): The option string passed to liblinear. + K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. + dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. Returns: @@ -217,7 +221,7 @@ def train_tree( """ label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) root.is_root = True num_nodes = 0 @@ -260,18 +264,20 @@ def visit(node): return TreeModel(root, flat_model, node_ptr) -def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int) -> Node: +def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: """Build the tree recursively by kmeans clustering. Args: label_representation (sparse.csr_matrix): A matrix with dimensions number of classes under this node * number of features. 
label_map (np.ndarray): Maps 0..label_representation.shape[0] to the original label indices. d (int): Current depth. + K (int): Maximum degree of nodes in the tree. + dmax (int): Maximum depth of the tree. Returns: Node: Root of the (sub)tree built from label_representation. """ - if d >= DMAX or label_representation.shape[0] <= K: + if d >= dmax or label_representation.shape[0] <= K: return Node(label_map=label_map, children=[]) metalabels = ( @@ -291,7 +297,7 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, for i in range(K): child_representation = label_representation[metalabels == i] child_map = label_map[metalabels == i] - child = _build_tree(child_representation, child_map, d + 1) + child = _build_tree(child_representation, child_map, d + 1, K, dmax) children.append(child) return Node(label_map=label_map, children=children) @@ -413,6 +419,8 @@ def train_ensemble_tree( y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", + K: int = DEFAULT_K, + dmax: int = DEFAULT_DMAX, n_trees: int = 3, seed: int = 42, verbose: bool = True, @@ -422,6 +430,8 @@ def train_ensemble_tree( y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str, optional): The option string passed to liblinear. Defaults to ''. + K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. + dmax (int, optional): Maximum depth of the tree. Defaults to 10. n_trees (int, optional): Number of trees in the ensemble. Defaults to 3. seed (int, optional): The base random seed for the ensemble. Defaults to 42. verbose (bool, optional): Output extra progress information. Defaults to True. @@ -433,10 +443,9 @@ def train_ensemble_tree( for i in range(n_trees): np.random.seed(seed + i) - tree_model = train_tree(y, x, options, verbose=False) + tree_model = train_tree(y, x, options, K, dmax, verbose) tree_models.append(tree_model) - if verbose: - print("Ensemble training completed.") + print("Ensemble training completed.") return EnsembleTreeModel(tree_models) diff --git a/linear_trainer.py b/linear_trainer.py index f8a46a2f..8fbf699a 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -48,13 +48,14 @@ def linear_train(datasets, config): if config.linear_technique == "tree": if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") - linear.tree.K = config.tree_degree - linear.tree.DMAX = config.tree_max_depth + if config.tree_ensemble_models > 1: model = train_ensemble_tree( datasets["train"]["y"], datasets["train"]["x"], options=config.liblinear_options, + K=config.tree_degree, + dmax=config.tree_max_depth, n_trees=config.tree_ensemble_models, seed=config.seed if config.seed is not None else 42, ) @@ -63,6 +64,8 @@ def linear_train(datasets, config): datasets["train"]["y"], datasets["train"]["x"], options=config.liblinear_options, + K=config.tree_degree, + dmax=config.tree_max_depth, ) else: model = LINEAR_TECHNIQUES[config.linear_technique]( From 65521b9f92e2debaadc33e57b6c78aaa29f0ffda Mon Sep 17 00:00:00 2001 From: shenkha Date: Tue, 5 Aug 2025 11:52:21 +0400 Subject: [PATCH 19/20] adding the tutorial tree webpage --- docs/examples/plot_linear_tree_tutorial.py | 113 ++++++++++++++++++--- 1 file changed, 97 insertions(+), 16 deletions(-) diff --git a/docs/examples/plot_linear_tree_tutorial.py b/docs/examples/plot_linear_tree_tutorial.py index 846ae88a..d0c70318 100644 --- 
+++ b/docs/examples/plot_linear_tree_tutorial.py
@@ -2,35 +2,38 @@
 Handling Data with Many Labels Using Linear Methods
 ====================================================
 
-For the case that the amount of labels is very large,
-the training time of the standard ``train_1vsrest`` method may be unpleasantly long.
-The ``train_tree`` method in LibMultiLabel can vastly improve the training time on such data sets.
+For datasets with a very large number of labels, the training time of the standard ``train_1vsrest`` method can be prohibitively long. LibMultiLabel offers tree-based methods like ``train_tree`` and ``train_ensemble_tree`` to vastly improve training time in such scenarios.
 
-To illustrate this speedup, we will use the `EUR-Lex dataset `_, which contains 3,956 labels.
-The data in the following example is downloaded under the directory ``data/eur-lex``
-Users can use the following command to easily apply the ``train_tree`` method.
-
-.. code-block:: bash
-
-    $ python3 main.py --training_file data/eur-lex/train.txt
-                      --test_file data/eur-lex/test.txt
-                      --linear
-                      --linear_technique tree
-
-Besides CLI usage, users can also use API to apply ``train_tree`` method.
-Below is an example.
+We will use the `EUR-Lex dataset `_, which contains 3,956 labels. The data is assumed to be downloaded under the directory ``data/eur-lex``.
 """
 
 import math
 import libmultilabel.linear as linear
 import time
 
+# Load and preprocess the dataset
 datasets = linear.load_dataset("txt", "data/eurlex/train.txt", "data/eurlex/test.txt")
 preprocessor = linear.Preprocessor()
 datasets = preprocessor.fit_transform(datasets)
 
+######################################################################
+# Standard Training and Prediction
+# --------------------------------
+#
+# Users can use the following command to easily apply the ``train_tree`` method.
+#
+# .. code-block:: bash
+#
+#     $ python3 main.py --training_file data/eur-lex/train.txt \\
+#                       --test_file data/eur-lex/test.txt \\
+#                       --linear \\
+#                       --linear_technique tree
+#
+# Besides CLI usage, users can also use the API to apply the ``train_tree`` method.
+# Below is an example.
+
 training_start = time.time()
 # the standard one-vs-rest method for multi-label problems
 ovr_model = linear.train_1vsrest(datasets["train"]["y"], datasets["train"]["x"])
@@ -99,3 +102,81 @@ def metrics_in_batches(model):
 
 print("Score of 1vsrest:", metrics_in_batches(ovr_model))
 print("Score of tree:", metrics_in_batches(tree_model))
+
+######################################################################
+# Ensemble of Tree Models
+# -----------------------
+#
+# While the ``train_tree`` method offers a significant speedup, its accuracy can sometimes be slightly lower than the standard one-vs-rest approach.
+# The ``train_ensemble_tree`` method can help bridge this gap by training multiple tree models and averaging their predictions.
+#
+# Users can use the following command to easily apply the ``train_ensemble_tree`` method.
+# The number of trees in the ensemble can be controlled with the ``--tree_ensemble_models`` argument.
+#
+# .. code-block:: bash
+#
+#     $ python3 main.py --training_file data/eur-lex/train.txt \\
+#                       --test_file data/eur-lex/test.txt \\
+#                       --linear \\
+#                       --linear_technique tree \\
+#                       --tree_ensemble_models 3
+#
+# This command trains an ensemble of 3 tree models. If ``--tree_ensemble_models`` is not specified, it defaults to 1 (a single tree).
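+#
+# The shape of each tree can be tuned alongside the ensemble size: the
+# ``tree_degree`` and ``tree_max_depth`` config options map to the ``K``
+# (maximum node degree) and ``dmax`` (maximum depth) parameters of the API.
+# As a rough sketch, assuming these options are exposed as CLI flags of the
+# same name and keeping the library defaults (``K=100``, ``dmax=10``):
+#
+# .. code-block:: bash
+#
+#     $ python3 main.py --training_file data/eur-lex/train.txt \\
+#                       --test_file data/eur-lex/test.txt \\
+#                       --linear \\
+#                       --linear_technique tree \\
+#                       --tree_ensemble_models 3 \\
+#                       --tree_degree 100 \\
+#                       --tree_max_depth 10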
+#
+# Besides CLI usage, users can also use the API to apply the ``train_ensemble_tree`` method.
+# Below is an example.
+
+# We have already trained a single tree model as a baseline.
+# Now, let's train an ensemble of 3 tree models.
+training_start = time.time()
+ensemble_model = linear.train_ensemble_tree(
+    datasets["train"]["y"], datasets["train"]["x"], n_trees=3
+)
+training_end = time.time()
+print("Training time of ensemble tree: {:10.2f}".format(training_end - training_start))
+
+######################################################################
+# On a machine with an AMD-7950X CPU,
+# the ``train_ensemble_tree`` function with 3 trees took `421.15` seconds,
+# while the single tree took `144.37` seconds.
+# As expected, training an ensemble takes longer, roughly proportional to the number of trees.
+#
+# Now, let's see if this additional training time translates to better performance.
+# We'll compute the same P@K metrics on the test set for both the single tree and the ensemble model.
+
+# `tree_preds` and `target` are already computed in the previous section.
+ensemble_preds = linear.predict_values(ensemble_model, datasets["test"]["x"])
+
+# `tree_score` is already computed.
+print("Score of single tree:", tree_score)
+
+ensemble_score = linear.compute_metrics(ensemble_preds, target, ["P@1", "P@3", "P@5"])
+print("Score of ensemble tree:", ensemble_score)
+
+######################################################################
+# While training an ensemble takes longer, it often leads to better predictive performance.
+# The following table shows a comparison between a single tree and ensembles
+# of 3, 10, and 15 trees on several benchmark datasets.
+#
+# .. table:: Benchmark Results for Single and Ensemble Tree Models (P@K in %)
+#
+#    +---------------+-----------------+-------+-------+-------+
+#    | Dataset       | Model           | P@1   | P@3   | P@5   |
+#    +===============+=================+=======+=======+=======+
+#    | EURLex-4k     | Single Tree     | 82.35 | 68.98 | 57.62 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-3      | 82.38 | 69.28 | 58.01 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-10     | 82.74 | 69.66 | 58.39 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-15     | 82.61 | 69.56 | 58.29 |
+#    +---------------+-----------------+-------+-------+-------+
+#    | EURLex-57k    | Single Tree     | 90.77 | 80.81 | 67.82 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-3      | 91.02 | 81.06 | 68.26 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-10     | 91.23 | 81.22 | 68.34 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-15     | 91.25 | 81.31 | 68.34 |
+#    +---------------+-----------------+-------+-------+-------+
+

From a67ee1185e1df669c86b3a4abf2400fb5c8d4434 Mon Sep 17 00:00:00 2001
From: shenkha
Date: Wed, 13 Aug 2025 14:31:39 +0400
Subject: [PATCH 20/20] Fix seed handling for ensemble training

---
 libmultilabel/linear/tree.py | 7 +++++--
 linear_trainer.py            | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index f1fa7ec2..34f06370 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -422,8 +422,8 @@ def train_ensemble_tree(
     K: int = DEFAULT_K,
     dmax: int = DEFAULT_DMAX,
     n_trees: int = 3,
-    seed: int = 42,
     verbose: bool = True,
+    seed: int = None,
 ) -> EnsembleTreeModel:
     """Trains an ensemble of tree models (Parabel/Bonsai-style).
    Args:
@@ -433,12 +433,15 @@
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
         K (int, optional): Maximum degree of nodes in the tree. Defaults to 100.
         dmax (int, optional): Maximum depth of the tree. Defaults to 10.
         n_trees (int, optional): Number of trees in the ensemble. Defaults to 3.
-        seed (int, optional): The base random seed for the ensemble. Defaults to 42.
         verbose (bool, optional): Output extra progress information. Defaults to True.
+        seed (int, optional): The base random seed for the ensemble. Defaults to None, in which case 42 is used.
 
     Returns:
         EnsembleTreeModel: An ensemble model which can be used for prediction.
     """
+    if seed is None:
+        seed = 42
+
     tree_models = []
     for i in range(n_trees):
         np.random.seed(seed + i)
diff --git a/linear_trainer.py b/linear_trainer.py
index 8fbf699a..b9133857 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -57,7 +57,7 @@ def linear_train(datasets, config):
                 K=config.tree_degree,
                 dmax=config.tree_max_depth,
                 n_trees=config.tree_ensemble_models,
-                seed=config.seed if config.seed is not None else 42,
+                seed=config.seed,
             )
         else:
             model = LINEAR_TECHNIQUES[config.linear_technique](
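
Note: as a quick reference for the end state of this series, the snippet below
sketches the resulting API end to end. It is illustrative only: it assumes the
tutorial's data layout under ``data/eurlex`` and uses just the parameters
introduced in these patches (``K``, ``dmax``, ``n_trees``, ``seed``);
``seed=None`` falls back to the base seed 42 inside ``train_ensemble_tree``.

    import libmultilabel.linear as linear

    datasets = linear.load_dataset("txt", "data/eurlex/train.txt", "data/eurlex/test.txt")
    preprocessor = linear.Preprocessor()
    datasets = preprocessor.fit_transform(datasets)

    # K and dmax default to DEFAULT_K=100 and DEFAULT_DMAX=10; tree i of the
    # ensemble is built after calling np.random.seed(seed + i).
    model = linear.train_ensemble_tree(
        datasets["train"]["y"],
        datasets["train"]["x"],
        K=100,
        dmax=10,
        n_trees=3,
        seed=None,  # None -> base seed 42
    )

    # EnsembleTreeModel averages the per-tree probability estimates.
    preds = model.predict_values(datasets["test"]["x"], beam_width=10)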