From 01f179e9587897b74e27e1abb69dfd75aa321c45 Mon Sep 17 00:00:00 2001 From: shenkha Date: Mon, 14 Jul 2025 18:33:45 +0400 Subject: [PATCH 01/20] feat(linear): Add ensemble tree model and solver-aware scoring --- libmultilabel/linear/tree.py | 111 +++++++++++++++++++++++++++++++---- linear_trainer.py | 28 ++++++--- main.py | 3 + 3 files changed, 122 insertions(+), 20 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index fe6e94b4..fecb1f45 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -6,12 +6,14 @@ import scipy.sparse as sparse import sklearn.cluster import sklearn.preprocessing +import sklearn.utils from tqdm import tqdm import psutil - from . import linear +from scipy.special import log_expit +#from sparsekmeans import LloydKmeans, ElkanKmeans -__all__ = ["train_tree", "TreeModel"] +__all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"] class Node: @@ -47,20 +49,39 @@ def __init__( root: Node, flat_model: linear.FlatModel, node_ptr: np.ndarray, + options: str, ): self.name = "tree" self.root = root self.flat_model = flat_model self.node_ptr = node_ptr + self.options = options self.multiclass = False self._model_separated = False # Indicates whether the model has been separated for pruning tree. + def _is_lr(self) -> bool: + options = self.options or "" + options_split = options.split() + if "-s" in options_split: + i = options_split.index("-s") + if i + 1 < len(options_split): + solver_type = options_split[i + 1] + return solver_type in ["0", "6", "7"] + return False + + def _get_scores(self, pred, parent_score=0.0): + if self._is_lr(): + #return parent_score - np.log(1 + np.exp(-pred)) + return parent_score + log_expit(pred) + else: + return parent_score - np.square(np.maximum(0, 1 - pred)) + def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, ) -> np.ndarray: - """Calculate the probability estimates associated with x. + """Calculates the probability estimates associated with x. Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. @@ -109,17 +130,14 @@ def _separate_model_for_pruning_tree(self): **tree_flat_model_params ) self.subtree_models.append(subtree_flatmodel) - + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. - Only subtrees corresponding to the top beam_width candidates from the root are evaluated, skipping the rest to avoid unnecessary computation. - Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. - Returns: np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels). 
""" @@ -129,7 +147,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) + children_scores = self._get_scores(root_preds) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -145,6 +163,8 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) for subtree_idx in range(len(self.root.children)): subtree_model = self.subtree_models[subtree_idx] instances_mask = mask[:, subtree_idx] + if not np.any(instances_mask): + continue reduced_instances = x[np.s_[instances_mask], :] # Locate the position of the subtree root in the weight mapping of all nodes @@ -179,18 +199,18 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - np.square(np.maximum(0, 1 - pred)) + children_score = self._get_scores(pred, score) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] next_level = [] num_labels = len(self.root.label_map) - scores = np.zeros(num_labels) + scores = np.full(num_labels, 0.0) for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) + scores[node.label_map] = np.exp(self._get_scores(pred, score)) return scores @@ -258,7 +278,7 @@ def visit(node): pbar.close() flat_model, node_ptr = _flatten_model(root) - return TreeModel(root, flat_model, node_ptr) + return TreeModel(root, flat_model, node_ptr, options) def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: @@ -382,3 +402,70 @@ def visit(node): node_ptr = np.cumsum([0] + list(map(lambda w: w.shape[1], weights))) return model, node_ptr + + +class EnsembleTreeModel: + """An ensemble of tree models. + The ensemble aggregates predictions from multiple trees to improve accuracy and robustness. + """ + + def __init__(self, tree_models: list[TreeModel]): + """ + Args: + tree_models (list[TreeModel]): A list of trained tree models. + """ + self.name = "ensemble-tree" + self.tree_models = tree_models + self.multiclass = False + + def predict_values(self, x: sparse.csr_matrix, beam_width: int = 10) -> np.ndarray: + """Calculates the averaged probability estimates from all trees in the ensemble. + + Args: + x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. + beam_width (int, optional): Number of candidates considered during beam search for each tree. Defaults to 10. + + Returns: + np.ndarray: A matrix with dimension number of instances * number of classes, containing averaged scores. + """ + all_predictions = [model.predict_values(x, beam_width) for model in self.tree_models] + return np.mean(all_predictions, axis=0) + + +def train_ensemble_tree( + y: sparse.csr_matrix, + x: sparse.csr_matrix, + options: str = "", + K: int = 100, + dmax: int = 10, + n_trees: int = 3, + seed: int = 42, + verbose: bool = True, +) -> EnsembleTreeModel: + """Trains an ensemble of tree models (Parabel/Bonsai-style). 
+ Args: + y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. + x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. + options (str, optional): The option string passed to liblinear. Defaults to ''. + K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. + dmax (int, optional): Maximum depth of the tree. Defaults to 10. + n_trees (int, optional): Number of trees in the ensemble. Defaults to 3. + seed (int, optional): The base random seed for the ensemble. Defaults to 42. + verbose (bool, optional): Output extra progress information. Defaults to True. + + Returns: + EnsembleTreeModel: An ensemble model which can be used for prediction. + """ + tree_models = [] + for i in range(n_trees): + np.random.seed(seed + i) + + tree_model = train_tree(y, x, options, K, dmax, verbose=False) + tree_models.append(tree_model) + + + + if verbose: + print("Ensemble training completed.") + + return EnsembleTreeModel(tree_models) \ No newline at end of file diff --git a/linear_trainer.py b/linear_trainer.py index b0524ee7..a664945b 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -6,6 +6,7 @@ import libmultilabel.linear as linear from libmultilabel.common_utils import dump_log, is_multiclass_dataset +from libmultilabel.linear.tree import train_ensemble_tree from libmultilabel.linear.utils import LINEAR_TECHNIQUES @@ -21,7 +22,7 @@ def linear_test(config, model, datasets, label_mapping): scores = [] predict_kwargs = {} - if model.name == "tree": + if model.name == "tree" or model.name == "ensemble-tree": predict_kwargs["beam_width"] = config.beam_width for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): @@ -48,13 +49,24 @@ def linear_train(datasets, config): if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") - model = LINEAR_TECHNIQUES[config.linear_technique]( - datasets["train"]["y"], - datasets["train"]["x"], - options=config.liblinear_options, - K=config.tree_degree, - dmax=config.tree_max_depth, - ) + if config.tree_ensemble_models > 1: + model = train_ensemble_tree( + datasets["train"]["y"], + datasets["train"]["x"], + options=config.liblinear_options, + K=config.tree_degree, + dmax=config.tree_max_depth, + n_trees=config.tree_ensemble_models, + seed=config.seed if config.seed is not None else 42, + ) + else: + model = LINEAR_TECHNIQUES[config.linear_technique]( + datasets["train"]["y"], + datasets["train"]["x"], + options=config.liblinear_options, + K=config.tree_degree, + dmax=config.tree_max_depth, + ) else: model = LINEAR_TECHNIQUES[config.linear_technique]( datasets["train"]["y"], diff --git a/main.py b/main.py index 12564f6b..86212980 100644 --- a/main.py +++ b/main.py @@ -223,6 +223,9 @@ def add_all_arguments(parser): parser.add_argument( "--tree_max_depth", type=int, default=10, help="Maximum depth of the tree (default: %(default)s)" ) + parser.add_argument( + "--tree_ensemble_models", type=int, default=1, help="Number of models in the tree ensemble (default: %(default)s)" + ) parser.add_argument( "--beam_width", type=int, From 3c13e37991c70bfcfd0e2528a4ed96dbe8aaa013 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 16 Jul 2025 20:43:33 +0400 Subject: [PATCH 02/20] revert unnecessary changes --- libmultilabel/linear/tree.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index fecb1f45..ba7d6a7f 100644 --- a/libmultilabel/linear/tree.py +++ 
b/libmultilabel/linear/tree.py @@ -9,9 +9,10 @@ import sklearn.utils from tqdm import tqdm import psutil + from . import linear from scipy.special import log_expit -#from sparsekmeans import LloydKmeans, ElkanKmeans + __all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"] @@ -71,7 +72,6 @@ def _is_lr(self) -> bool: def _get_scores(self, pred, parent_score=0.0): if self._is_lr(): - #return parent_score - np.log(1 + np.exp(-pred)) return parent_score + log_expit(pred) else: return parent_score - np.square(np.maximum(0, 1 - pred)) @@ -133,11 +133,14 @@ def _separate_model_for_pruning_tree(self): def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. + Only subtrees corresponding to the top beam_width candidates from the root are evaluated, skipping the rest to avoid unnecessary computation. + Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. + Returns: np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels). """ @@ -206,7 +209,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra next_level = [] num_labels = len(self.root.label_map) - scores = np.full(num_labels, 0.0) + scores = np.zeros(num_labels) for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] From b363dbdad569918f29dfd11cf9be896b50d5b3b9 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 16 Jul 2025 20:49:37 +0400 Subject: [PATCH 03/20] revert small changes --- libmultilabel/linear/tree.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index ba7d6a7f..064dc41b 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -81,7 +81,7 @@ def predict_values( x: sparse.csr_matrix, beam_width: int = 10, ) -> np.ndarray: - """Calculates the probability estimates associated with x. + """Calculate the probability estimates associated with x. Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. 
@@ -166,8 +166,6 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) for subtree_idx in range(len(self.root.children)): subtree_model = self.subtree_models[subtree_idx] instances_mask = mask[:, subtree_idx] - if not np.any(instances_mask): - continue reduced_instances = x[np.s_[instances_mask], :] # Locate the position of the subtree root in the weight mapping of all nodes From 024b5444a222e23d4a1d83cc839fcbf004d2af03 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 16 Jul 2025 20:51:10 +0400 Subject: [PATCH 04/20] remove unnecessary import --- libmultilabel/linear/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 064dc41b..66e9e48d 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -6,7 +6,7 @@ import scipy.sparse as sparse import sklearn.cluster import sklearn.preprocessing -import sklearn.utils + from tqdm import tqdm import psutil From 6335dfec969f1217cdcab1d96dc9f20da1c4bb76 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 16 Jul 2025 20:52:15 +0400 Subject: [PATCH 05/20] revert small change --- libmultilabel/linear/tree.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 66e9e48d..9590c3ff 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -6,7 +6,6 @@ import scipy.sparse as sparse import sklearn.cluster import sklearn.preprocessing - from tqdm import tqdm import psutil From 3016397e6a17d154231fb4696cca6967a1ccee83 Mon Sep 17 00:00:00 2001 From: shenkha Date: Thu, 17 Jul 2025 16:25:19 +0400 Subject: [PATCH 06/20] applied black --- libmultilabel/linear/tree.py | 38 ++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 9590c3ff..3aecb67c 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -57,7 +57,7 @@ def __init__( self.node_ptr = node_ptr self.options = options self.multiclass = False - self._model_separated = False # Indicates whether the model has been separated for pruning tree. + self._model_separated = False # Indicates whether the model has been separated for pruning tree. def _is_lr(self) -> bool: options = self.options or "" @@ -92,13 +92,17 @@ def predict_values( if beam_width >= len(self.root.children): # Beam_width is sufficiently large; pruning not applied. # Calculates decision values for all nodes. - all_preds = linear.predict_values(self.flat_model, x) # number of instances * (number of labels + total number of metalabels) + all_preds = linear.predict_values( + self.flat_model, x + ) # number of instances * (number of labels + total number of metalabels) else: # Beam_width is small; pruning applied to reduce computation. 
if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels) + all_preds = self._prune_tree_and_predict_values( + x, beam_width + ) # number of instances * (number of labels + total number of metalabels) return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): @@ -106,27 +110,21 @@ def _separate_model_for_pruning_tree(self): This function separates the weights for the root node and its children into (K+1) FlatModel for efficient beam search traversal in Python. """ - tree_flat_model_params = { - 'bias': self.root.model.bias, - 'thresholds': 0, - 'multiclass': False - } + tree_flat_model_params = {"bias": self.root.model.bias, "thresholds": 0, "multiclass": False} slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] self.root_model = linear.FlatModel( - name="root-flattened-tree", - weights=self.flat_model.weights[slice].tocsr(), - **tree_flat_model_params + name="root-flattened-tree", weights=self.flat_model.weights[slice].tocsr(), **tree_flat_model_params ) self.subtree_models = [] for i in range(len(self.root.children)): subtree_weights_start = self.node_ptr[self.root.children[i].index] - subtree_weights_end = self.node_ptr[self.root.children[i+1].index] if i+1 < len(self.root.children) else -1 + subtree_weights_end = ( + self.node_ptr[self.root.children[i + 1].index] if i + 1 < len(self.root.children) else -1 + ) slice = np.s_[:, subtree_weights_start:subtree_weights_end] subtree_flatmodel = linear.FlatModel( - name="subtree-flattened-tree", - weights=self.flat_model.weights[slice].tocsr(), - **tree_flat_model_params + name="subtree-flattened-tree", weights=self.flat_model.weights[slice].tocsr(), **tree_flat_model_params ) self.subtree_models.append(subtree_flatmodel) @@ -135,7 +133,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) Only subtrees corresponding to the top beam_width candidates from the root are evaluated, skipping the rest to avoid unnecessary computation. - + Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. 
@@ -160,7 +158,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
         # Build a mask where mask[i, j] is True if the j-th subtree is among the top beam_width subtrees for the i-th instance
         mask = np.zeros_like(children_scores, dtype=np.bool_)
         np.put_along_axis(mask, top_beam_width_indices, True, axis=1)
-        
+
         # Calculate predictions for each subtree with its corresponding instances
         for subtree_idx in range(len(self.root.children)):
             subtree_model = self.subtree_models[subtree_idx]
@@ -459,13 +457,11 @@ def train_ensemble_tree(
     tree_models = []
     for i in range(n_trees):
         np.random.seed(seed + i)
-        
+
         tree_model = train_tree(y, x, options, K, dmax, verbose=False)
         tree_models.append(tree_model)
 
-        
-        
     if verbose:
         print("Ensemble training completed.")
 
-    return EnsembleTreeModel(tree_models)
\ No newline at end of file
+    return EnsembleTreeModel(tree_models)

From 85c2e196c05684f88975455792dbcf837b4e7a42 Mon Sep 17 00:00:00 2001
From: shenkha
Date: Mon, 21 Jul 2025 17:30:23 +0400
Subject: [PATCH 07/20] add type hints to _get_scores

---
 libmultilabel/linear/tree.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 3aecb67c..017e28fc 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -69,7 +69,7 @@ def _is_lr(self) -> bool:
                 return solver_type in ["0", "6", "7"]
         return False
 
-    def _get_scores(self, pred, parent_score=0.0):
+    def _get_scores(self, pred: np.ndarray, parent_score: float = 0.0) -> np.ndarray:
         if self._is_lr():
             return parent_score + log_expit(pred)
         else:
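For reference, the solver-aware scoring introduced in PATCH 01 and annotated in PATCH 07 (and removed again in PATCH 15 below) reduces to the following standalone sketch. It is illustrative only and assumes NumPy and SciPy; here `is_lr` stands in for `TreeModel._is_lr()`, which checks whether the liblinear `-s` option selects a logistic-regression solver (0, 6, or 7).

import numpy as np
from scipy.special import log_expit

def get_scores(pred: np.ndarray, parent_score: float = 0.0, is_lr: bool = False) -> np.ndarray:
    # Illustrative sketch of TreeModel._get_scores: accumulate a node's
    # beam-search score from its parent's score.
    if is_lr:
        # LR solvers (-s 0/6/7): decision values are log-odds, so
        # log_expit(pred) = log(sigmoid(pred)) accumulates log-probabilities.
        return parent_score + log_expit(pred)
    # L2-loss SVM solvers: negated squared hinge, -max(0, 1 - pred)^2,
    # so predictions beyond the margin (pred >= 1) cost nothing.
    return parent_score - np.square(np.maximum(0, 1 - pred))

pred = np.array([3.0, -0.5])
print(get_scores(pred, is_lr=True))   # approx. [-0.0486 -0.9741]
print(get_scores(pred, is_lr=False))  # [ 0.   -2.25]

Both branches are monotone in the decision value, so either ranks children consistently during beam search; the log-sigmoid form additionally keeps the accumulated scores interpretable as log-probabilities under LR solvers.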
From ab9cde0f558aa25f31491cca9104794326e90e07 Mon Sep 17 00:00:00 2001
From: shenkha
Date: Wed, 23 Jul 2025 11:34:06 +0400
Subject: [PATCH 15/20] keep only the ensemble implementation and remove solver-aware scoring

---
 libmultilabel/linear/tree.py | 60 +++++++++++++-----------------
 1 file changed, 21 insertions(+), 39 deletions(-)

diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 017e28fc..d630ff24 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -10,8 +10,6 @@
 import psutil
 
 from . import linear
-from scipy.special import log_expit
-
 
 __all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"]
 
@@ -49,31 +47,13 @@ def __init__(
         root: Node,
         flat_model: linear.FlatModel,
         node_ptr: np.ndarray,
-        options: str,
     ):
         self.name = "tree"
         self.root = root
         self.flat_model = flat_model
         self.node_ptr = node_ptr
-        self.options = options
         self.multiclass = False
-        self._model_separated = False  # Indicates whether the model has been separated for pruning tree.
-
-    def _is_lr(self) -> bool:
-        options = self.options or ""
-        options_split = options.split()
-        if "-s" in options_split:
-            i = options_split.index("-s")
-            if i + 1 < len(options_split):
-                solver_type = options_split[i + 1]
-                return solver_type in ["0", "6", "7"]
-        return False
-
-    def _get_scores(self, pred: np.ndarray, parent_score: float = 0.0) -> np.ndarray:
-        if self._is_lr():
-            return parent_score + log_expit(pred)
-        else:
-            return parent_score - np.square(np.maximum(0, 1 - pred))
+        self._model_separated = False  # Indicates whether the model has been separated for pruning tree.
 
     def predict_values(
         self,
         x: sparse.csr_matrix,
         beam_width: int = 10,
     ) -> np.ndarray:
@@ -92,17 +72,13 @@ def predict_values(
         if beam_width >= len(self.root.children):
             # Beam_width is sufficiently large; pruning not applied.
             # Calculates decision values for all nodes.
- all_preds = linear.predict_values( - self.flat_model, x - ) # number of instances * (number of labels + total number of metalabels) + all_preds = linear.predict_values(self.flat_model, x) # number of instances * (number of labels + total number of metalabels) else: # Beam_width is small; pruning applied to reduce computation. if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values( - x, beam_width - ) # number of instances * (number of labels + total number of metalabels) + all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels) return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): @@ -110,24 +86,30 @@ def _separate_model_for_pruning_tree(self): This function separates the weights for the root node and its children into (K+1) FlatModel for efficient beam search traversal in Python. """ - tree_flat_model_params = {"bias": self.root.model.bias, "thresholds": 0, "multiclass": False} + tree_flat_model_params = { + 'bias': self.root.model.bias, + 'thresholds': 0, + 'multiclass': False + } slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] self.root_model = linear.FlatModel( - name="root-flattened-tree", weights=self.flat_model.weights[slice].tocsr(), **tree_flat_model_params + name="root-flattened-tree", + weights=self.flat_model.weights[slice].tocsr(), + **tree_flat_model_params ) self.subtree_models = [] for i in range(len(self.root.children)): subtree_weights_start = self.node_ptr[self.root.children[i].index] - subtree_weights_end = ( - self.node_ptr[self.root.children[i + 1].index] if i + 1 < len(self.root.children) else -1 - ) + subtree_weights_end = self.node_ptr[self.root.children[i+1].index] if i+1 < len(self.root.children) else -1 slice = np.s_[:, subtree_weights_start:subtree_weights_end] subtree_flatmodel = linear.FlatModel( - name="subtree-flattened-tree", weights=self.flat_model.weights[slice].tocsr(), **tree_flat_model_params + name="subtree-flattened-tree", + weights=self.flat_model.weights[slice].tocsr(), + **tree_flat_model_params ) self.subtree_models.append(subtree_flatmodel) - + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. 
@@ -147,7 +129,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - children_scores = self._get_scores(root_preds) + children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -158,7 +140,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Build a mask where mask[i, j] is True if the j-th subtree is among the top beam_width subtrees for the i-th instance mask = np.zeros_like(children_scores, dtype=np.bool_) np.put_along_axis(mask, top_beam_width_indices, True, axis=1) - + # Calculate predictions for each subtree with its corresponding instances for subtree_idx in range(len(self.root.children)): subtree_model = self.subtree_models[subtree_idx] @@ -197,7 +179,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = self._get_scores(pred, score) + children_score = score - np.square(np.maximum(0, 1 - pred)) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -208,7 +190,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(self._get_scores(pred, score)) + scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) return scores @@ -276,7 +258,7 @@ def visit(node): pbar.close() flat_model, node_ptr = _flatten_model(root) - return TreeModel(root, flat_model, node_ptr, options) + return TreeModel(root, flat_model, node_ptr) def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: From 5419063986aa441a50e33830652c4a23f2557988 Mon Sep 17 00:00:00 2001 From: shenkha Date: Wed, 23 Jul 2025 13:54:09 +0400 Subject: [PATCH 16/20] resolve SW's comment --- linear_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linear_trainer.py b/linear_trainer.py index a664945b..8fbf699a 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -6,7 +6,7 @@ import libmultilabel.linear as linear from libmultilabel.common_utils import dump_log, is_multiclass_dataset -from libmultilabel.linear.tree import train_ensemble_tree +from libmultilabel.linear.tree import EnsembleTreeModel, TreeModel, train_ensemble_tree from libmultilabel.linear.utils import LINEAR_TECHNIQUES @@ -22,7 +22,7 @@ def linear_test(config, model, datasets, label_mapping): scores = [] predict_kwargs = {} - if model.name == "tree" or model.name == "ensemble-tree": + if isinstance(model, (TreeModel, EnsembleTreeModel)): predict_kwargs["beam_width"] = config.beam_width for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): From 24f9a6be5be029d6b9a385615e59f2c1737b8a7d Mon Sep 17 00:00:00 2001 From: shenkha Date: Sun, 3 Aug 2025 17:18:24 +0400 Subject: [PATCH 17/20] making dmax and K as global variables --- libmultilabel/linear/tree.py | 23 ++++++++--------------- linear_trainer.py | 7 ++----- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 
d630ff24..98f95ceb 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -13,6 +13,9 @@ __all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"] +K = 100 +DMAX = 10 + class Node: def __init__( @@ -198,8 +201,6 @@ def train_tree( y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", - K=100, - dmax=10, verbose: bool = True, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. @@ -209,8 +210,6 @@ def train_tree( y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str): The option string passed to liblinear. - K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. - dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. Returns: @@ -218,7 +217,7 @@ def train_tree( """ label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0) root.is_root = True num_nodes = 0 @@ -261,20 +260,18 @@ def visit(node): return TreeModel(root, flat_model, node_ptr) -def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: +def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int) -> Node: """Build the tree recursively by kmeans clustering. Args: label_representation (sparse.csr_matrix): A matrix with dimensions number of classes under this node * number of features. label_map (np.ndarray): Maps 0..label_representation.shape[0] to the original label indices. d (int): Current depth. - K (int): Maximum degree of nodes in the tree. - dmax (int): Maximum depth of the tree. Returns: Node: Root of the (sub)tree built from label_representation. """ - if d >= dmax or label_representation.shape[0] <= K: + if d >= DMAX or label_representation.shape[0] <= K: return Node(label_map=label_map, children=[]) metalabels = ( @@ -294,7 +291,7 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, for i in range(K): child_representation = label_representation[metalabels == i] child_map = label_map[metalabels == i] - child = _build_tree(child_representation, child_map, d + 1, K, dmax) + child = _build_tree(child_representation, child_map, d + 1) children.append(child) return Node(label_map=label_map, children=children) @@ -416,8 +413,6 @@ def train_ensemble_tree( y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", - K: int = 100, - dmax: int = 10, n_trees: int = 3, seed: int = 42, verbose: bool = True, @@ -427,8 +422,6 @@ def train_ensemble_tree( y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str, optional): The option string passed to liblinear. Defaults to ''. - K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. - dmax (int, optional): Maximum depth of the tree. Defaults to 10. n_trees (int, optional): Number of trees in the ensemble. Defaults to 3. seed (int, optional): The base random seed for the ensemble. Defaults to 42. 
verbose (bool, optional): Output extra progress information. Defaults to True. @@ -440,7 +433,7 @@ def train_ensemble_tree( for i in range(n_trees): np.random.seed(seed + i) - tree_model = train_tree(y, x, options, K, dmax, verbose=False) + tree_model = train_tree(y, x, options, verbose=False) tree_models.append(tree_model) if verbose: diff --git a/linear_trainer.py b/linear_trainer.py index 8fbf699a..f8a46a2f 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -48,14 +48,13 @@ def linear_train(datasets, config): if config.linear_technique == "tree": if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") - + linear.tree.K = config.tree_degree + linear.tree.DMAX = config.tree_max_depth if config.tree_ensemble_models > 1: model = train_ensemble_tree( datasets["train"]["y"], datasets["train"]["x"], options=config.liblinear_options, - K=config.tree_degree, - dmax=config.tree_max_depth, n_trees=config.tree_ensemble_models, seed=config.seed if config.seed is not None else 42, ) @@ -64,8 +63,6 @@ def linear_train(datasets, config): datasets["train"]["y"], datasets["train"]["x"], options=config.liblinear_options, - K=config.tree_degree, - dmax=config.tree_max_depth, ) else: model = LINEAR_TECHNIQUES[config.linear_technique]( From 0669c9fce4fd1b45c636610a18aeb7968c92b96b Mon Sep 17 00:00:00 2001 From: shenkha Date: Mon, 4 Aug 2025 11:21:09 +0400 Subject: [PATCH 18/20] making dmax and K as global default value --- libmultilabel/linear/tree.py | 27 ++++++++++++++++++--------- linear_trainer.py | 7 +++++-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 98f95ceb..f1fa7ec2 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -13,8 +13,8 @@ __all__ = ["train_tree", "TreeModel", "train_ensemble_tree", "EnsembleTreeModel"] -K = 100 -DMAX = 10 +DEFAULT_K = 100 +DEFAULT_DMAX = 10 class Node: @@ -201,6 +201,8 @@ def train_tree( y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", + K=DEFAULT_K, + dmax=DEFAULT_DMAX, verbose: bool = True, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. @@ -210,6 +212,8 @@ def train_tree( y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str): The option string passed to liblinear. + K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. + dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. Returns: @@ -217,7 +221,7 @@ def train_tree( """ label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) root.is_root = True num_nodes = 0 @@ -260,18 +264,20 @@ def visit(node): return TreeModel(root, flat_model, node_ptr) -def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int) -> Node: +def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: """Build the tree recursively by kmeans clustering. Args: label_representation (sparse.csr_matrix): A matrix with dimensions number of classes under this node * number of features. 
label_map (np.ndarray): Maps 0..label_representation.shape[0] to the original label indices. d (int): Current depth. + K (int): Maximum degree of nodes in the tree. + dmax (int): Maximum depth of the tree. Returns: Node: Root of the (sub)tree built from label_representation. """ - if d >= DMAX or label_representation.shape[0] <= K: + if d >= dmax or label_representation.shape[0] <= K: return Node(label_map=label_map, children=[]) metalabels = ( @@ -291,7 +297,7 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, for i in range(K): child_representation = label_representation[metalabels == i] child_map = label_map[metalabels == i] - child = _build_tree(child_representation, child_map, d + 1) + child = _build_tree(child_representation, child_map, d + 1, K, dmax) children.append(child) return Node(label_map=label_map, children=children) @@ -413,6 +419,8 @@ def train_ensemble_tree( y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", + K: int = DEFAULT_K, + dmax: int = DEFAULT_DMAX, n_trees: int = 3, seed: int = 42, verbose: bool = True, @@ -422,6 +430,8 @@ def train_ensemble_tree( y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes. x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str, optional): The option string passed to liblinear. Defaults to ''. + K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. + dmax (int, optional): Maximum depth of the tree. Defaults to 10. n_trees (int, optional): Number of trees in the ensemble. Defaults to 3. seed (int, optional): The base random seed for the ensemble. Defaults to 42. verbose (bool, optional): Output extra progress information. Defaults to True. @@ -433,10 +443,9 @@ def train_ensemble_tree( for i in range(n_trees): np.random.seed(seed + i) - tree_model = train_tree(y, x, options, verbose=False) + tree_model = train_tree(y, x, options, K, dmax, verbose) tree_models.append(tree_model) - if verbose: - print("Ensemble training completed.") + print("Ensemble training completed.") return EnsembleTreeModel(tree_models) diff --git a/linear_trainer.py b/linear_trainer.py index f8a46a2f..8fbf699a 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -48,13 +48,14 @@ def linear_train(datasets, config): if config.linear_technique == "tree": if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") - linear.tree.K = config.tree_degree - linear.tree.DMAX = config.tree_max_depth + if config.tree_ensemble_models > 1: model = train_ensemble_tree( datasets["train"]["y"], datasets["train"]["x"], options=config.liblinear_options, + K=config.tree_degree, + dmax=config.tree_max_depth, n_trees=config.tree_ensemble_models, seed=config.seed if config.seed is not None else 42, ) @@ -63,6 +64,8 @@ def linear_train(datasets, config): datasets["train"]["y"], datasets["train"]["x"], options=config.liblinear_options, + K=config.tree_degree, + dmax=config.tree_max_depth, ) else: model = LINEAR_TECHNIQUES[config.linear_technique]( From 65521b9f92e2debaadc33e57b6c78aaa29f0ffda Mon Sep 17 00:00:00 2001 From: shenkha Date: Tue, 5 Aug 2025 11:52:21 +0400 Subject: [PATCH 19/20] adding the tutorial tree webpage --- docs/examples/plot_linear_tree_tutorial.py | 113 ++++++++++++++++++--- 1 file changed, 97 insertions(+), 16 deletions(-) diff --git a/docs/examples/plot_linear_tree_tutorial.py b/docs/examples/plot_linear_tree_tutorial.py index 846ae88a..d0c70318 100644 --- 
+++ b/docs/examples/plot_linear_tree_tutorial.py
@@ -2,35 +2,38 @@
 Handling Data with Many Labels Using Linear Methods
 ====================================================
 
-For the case that the amount of labels is very large,
-the training time of the standard ``train_1vsrest`` method may be unpleasantly long.
-The ``train_tree`` method in LibMultiLabel can vastly improve the training time on such data sets.
+For datasets with a very large number of labels, the training time of the standard ``train_1vsrest`` method can be prohibitively long. LibMultiLabel offers tree-based methods like ``train_tree`` and ``train_ensemble_tree`` to vastly improve training time in such scenarios.
 
-To illustrate this speedup, we will use the `EUR-Lex dataset `_, which contains 3,956 labels.
-The data in the following example is downloaded under the directory ``data/eur-lex``
-Users can use the following command to easily apply the ``train_tree`` method.
-
-.. code-block:: bash
-
-    $ python3 main.py --training_file data/eur-lex/train.txt
-                      --test_file data/eur-lex/test.txt
-                      --linear
-                      --linear_technique tree
-
-Besides CLI usage, users can also use API to apply ``train_tree`` method.
-Below is an example.
+We will use the `EUR-Lex dataset `_, which contains 3,956 labels. The data is assumed to be downloaded under the directory ``data/eur-lex``.
 """
 
 import math
 import libmultilabel.linear as linear
 import time
 
+# Load and preprocess the dataset
 datasets = linear.load_dataset("txt", "data/eurlex/train.txt", "data/eurlex/test.txt")
 preprocessor = linear.Preprocessor()
 datasets = preprocessor.fit_transform(datasets)
 
+######################################################################
+# Standard Training and Prediction
+# --------------------------------
+#
+# Users can use the following command to easily apply the ``train_tree`` method.
+#
+# .. code-block:: bash
+#
+#     $ python3 main.py --training_file data/eur-lex/train.txt \\
+#                       --test_file data/eur-lex/test.txt \\
+#                       --linear \\
+#                       --linear_technique tree
+#
+# Besides CLI usage, users can also use the API to apply the ``train_tree`` method.
+# Below is an example.
+
 training_start = time.time()
 # the standard one-vs-rest method for multi-label problems
 ovr_model = linear.train_1vsrest(datasets["train"]["y"], datasets["train"]["x"])
@@ -99,3 +102,81 @@ def metrics_in_batches(model):
 
 print("Score of 1vsrest:", metrics_in_batches(ovr_model))
 print("Score of tree:", metrics_in_batches(tree_model))
+
+######################################################################
+# Ensemble of Tree Models
+# -----------------------
+#
+# While the ``train_tree`` method offers a significant speedup, its accuracy can sometimes be slightly lower than the standard one-vs-rest approach.
+# The ``train_ensemble_tree`` method can help bridge this gap by training multiple tree models and averaging their predictions.
+#
+# Users can use the following command to easily apply the ``train_ensemble_tree`` method.
+# The number of trees in the ensemble can be controlled with the ``--tree_ensemble_models`` argument.
+#
+# .. code-block:: bash
+#
+#     $ python3 main.py --training_file data/eur-lex/train.txt \\
+#                       --test_file data/eur-lex/test.txt \\
+#                       --linear \\
+#                       --linear_technique tree \\
+#                       --tree_ensemble_models 3
+#
+# This command trains an ensemble of 3 tree models. If ``--tree_ensemble_models`` is not specified, it defaults to 1 (a single tree).
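+#
+# The shape of each tree can be tuned alongside the ensemble size: the
+# ``tree_degree`` and ``tree_max_depth`` config options map to the ``K``
+# (maximum node degree) and ``dmax`` (maximum depth) parameters of the API.
+# As a rough sketch, assuming these options are exposed as CLI flags of the
+# same name and keeping the library defaults (``K=100``, ``dmax=10``):
+#
+# .. code-block:: bash
+#
+#     $ python3 main.py --training_file data/eur-lex/train.txt \\
+#                       --test_file data/eur-lex/test.txt \\
+#                       --linear \\
+#                       --linear_technique tree \\
+#                       --tree_ensemble_models 3 \\
+#                       --tree_degree 100 \\
+#                       --tree_max_depth 10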
+#
+# Besides CLI usage, users can also use the API to apply the ``train_ensemble_tree`` method.
+# Below is an example.
+
+# We have already trained a single tree model as a baseline.
+# Now, let's train an ensemble of 3 tree models.
+training_start = time.time()
+ensemble_model = linear.train_ensemble_tree(
+    datasets["train"]["y"], datasets["train"]["x"], n_trees=3
+)
+training_end = time.time()
+print("Training time of ensemble tree: {:10.2f}".format(training_end - training_start))
+
+######################################################################
+# On a machine with an AMD-7950X CPU,
+# the ``train_ensemble_tree`` function with 3 trees took `421.15` seconds,
+# while the single tree took `144.37` seconds.
+# As expected, training an ensemble takes longer, roughly proportional to the number of trees.
+#
+# Now, let's see if this additional training time translates to better performance.
+# We'll compute the same P@K metrics on the test set for both the single tree and the ensemble model.
+
+# `tree_preds` and `target` are already computed in the previous section.
+ensemble_preds = linear.predict_values(ensemble_model, datasets["test"]["x"])
+
+# `tree_score` is already computed.
+print("Score of single tree:", tree_score)
+
+ensemble_score = linear.compute_metrics(ensemble_preds, target, ["P@1", "P@3", "P@5"])
+print("Score of ensemble tree:", ensemble_score)
+
+######################################################################
+# While training an ensemble takes longer, it often leads to better predictive performance.
+# The following table shows a comparison between a single tree and ensembles
+# of 3, 10, and 15 trees on several benchmark datasets.
+#
+# .. table:: Benchmark Results for Single and Ensemble Tree Models (P@K in %)
+#
+#    +---------------+-----------------+-------+-------+-------+
+#    | Dataset       | Model           | P@1   | P@3   | P@5   |
+#    +===============+=================+=======+=======+=======+
+#    | EURLex-4k     | Single Tree     | 82.35 | 68.98 | 57.62 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-3      | 82.38 | 69.28 | 58.01 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-10     | 82.74 | 69.66 | 58.39 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-15     | 82.61 | 69.56 | 58.29 |
+#    +---------------+-----------------+-------+-------+-------+
+#    | EURLex-57k    | Single Tree     | 90.77 | 80.81 | 67.82 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-3      | 91.02 | 81.06 | 68.26 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-10     | 91.23 | 81.22 | 68.34 |
+#    |               +-----------------+-------+-------+-------+
+#    |               | Ensemble-15     | 91.25 | 81.31 | 68.34 |
+#    +---------------+-----------------+-------+-------+-------+
+

From a67ee1185e1df669c86b3a4abf2400fb5c8d4434 Mon Sep 17 00:00:00 2001
From: shenkha
Date: Wed, 13 Aug 2025 14:31:39 +0400
Subject: [PATCH 20/20] Fix seed handling for ensemble training

---
 libmultilabel/linear/tree.py | 7 +++++--
 linear_trainer.py            | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index f1fa7ec2..34f06370 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -422,8 +422,8 @@ def train_ensemble_tree(
     K: int = DEFAULT_K,
     dmax: int = DEFAULT_DMAX,
     n_trees: int = 3,
-    seed: int = 42,
     verbose: bool = True,
+    seed: int = None,
 ) -> EnsembleTreeModel:
     """Trains an ensemble of tree models (Parabel/Bonsai-style).
    Args:
@@ -433,12 +433,15 @@
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
         K (int, optional): Maximum degree of nodes in the tree. Defaults to 100.
         dmax (int, optional): Maximum depth of the tree. Defaults to 10.
         n_trees (int, optional): Number of trees in the ensemble. Defaults to 3.
-        seed (int, optional): The base random seed for the ensemble. Defaults to 42.
         verbose (bool, optional): Output extra progress information. Defaults to True.
+        seed (int, optional): The base random seed for the ensemble. Defaults to None, in which case 42 is used.
 
     Returns:
         EnsembleTreeModel: An ensemble model which can be used for prediction.
     """
+    if seed is None:
+        seed = 42
+
     tree_models = []
     for i in range(n_trees):
         np.random.seed(seed + i)
diff --git a/linear_trainer.py b/linear_trainer.py
index 8fbf699a..b9133857 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -57,7 +57,7 @@ def linear_train(datasets, config):
                 K=config.tree_degree,
                 dmax=config.tree_max_depth,
                 n_trees=config.tree_ensemble_models,
-                seed=config.seed if config.seed is not None else 42,
+                seed=config.seed,
             )
         else:
             model = LINEAR_TECHNIQUES[config.linear_technique](
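
Note: as a quick reference for the end state of this series, the snippet below
sketches the resulting API end to end. It is illustrative only: it assumes the
tutorial's data layout under ``data/eurlex`` and uses just the parameters
introduced in these patches (``K``, ``dmax``, ``n_trees``, ``seed``);
``seed=None`` falls back to the base seed 42 inside ``train_ensemble_tree``.

    import libmultilabel.linear as linear

    datasets = linear.load_dataset("txt", "data/eurlex/train.txt", "data/eurlex/test.txt")
    preprocessor = linear.Preprocessor()
    datasets = preprocessor.fit_transform(datasets)

    # K and dmax default to DEFAULT_K=100 and DEFAULT_DMAX=10; tree i of the
    # ensemble is built after calling np.random.seed(seed + i).
    model = linear.train_ensemble_tree(
        datasets["train"]["y"],
        datasets["train"]["x"],
        K=100,
        dmax=10,
        n_trees=3,
        seed=None,  # None -> base seed 42
    )

    # EnsembleTreeModel averages the per-tree probability estimates.
    preds = model.predict_values(datasets["test"]["x"], beam_width=10)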