diff --git a/grid.py b/grid.py
new file mode 100644
index 00000000..2a83b3c6
--- /dev/null
+++ b/grid.py
@@ -0,0 +1,326 @@
+from dataclasses import make_dataclass, field, fields, asdict
+from typing import Callable
+
+import os
+import sys
+import itertools
+import argparse
+import logging
+
+import libmultilabel.linear as linear
+from libmultilabel.linear.tree import _build_tree
+from libmultilabel.common_utils import timer
+
+import sklearn.preprocessing
+import numpy as np
+import math
+
+
+# suppress unavoidable console output from sparsekmeans and sklearn preprocessors
+class __silent__:
+    def __init__(self):
+        self.stderr = os.dup(2)
+        self.devnull = os.open(os.devnull, os.O_WRONLY)
+
+    def __enter__(self):
+        os.dup2(self.devnull, 2)
+        self.stdout = sys.stdout
+        sys.stdout = open(os.devnull, "w")
+
+    def __exit__(self, type, value, traceback):
+        os.dup2(self.stderr, 2)
+        os.close(self.devnull)
+        os.close(self.stderr)
+        sys.stdout.close()
+        sys.stdout = self.stdout
+
+
+class GridParameter:
+
+    _tfidf_fields = [
+        ("ngram_range", tuple[int, int], field(default=(1, 1))),
+        ("max_features", int, field(default=None)),
+        ("min_df", float | int, field(default=1)),
+        ("stop_words", str | list, field(default=None)),
+        ("strip_accents", str | Callable, field(default=None)),
+        ("tokenizer", Callable, field(default=None)),
+    ]
+    _tree_fields = [
+        ("dmax", int, field(default=10)),
+        ("K", int, field(default=8)),
+    ]
+    _linear_fields = [
+        ("s", int, field(default=1)),
+        ("c", float, field(default=1)),
+        ("B", int, field(default=-1)),
+    ]
+    _predict_fields = [
+        ("beam_width", int, field(default=10)),
+        ("prob_A", int, field(default=3)),
+    ]
+
+    param_types = {
+        "tfidf": make_dataclass("TfidfParams", _tfidf_fields, frozen=True, order=True),
+        "tree": make_dataclass("TreeParams", _tree_fields, frozen=True, order=True),
+        "linear": make_dataclass("LinearParams", _linear_fields, frozen=True, order=True),
+        "predict": make_dataclass("PredictParams", _predict_fields, frozen=True, order=True),
+    }
+    _param_field_names = {
+        param_type: {f.name for f in fields(class_name)} for param_type, class_name in param_types.items()
+    }
+
+    def __init__(self, params: dict | None = None, fold: int = -1):
+        self.params = params or {}
+
+        params_set = set(self.params)
+        for param_type, class_name in self.param_types.items():
+            field_names = self._param_field_names[param_type]
+            filtered_keys = params_set & field_names
+            params_set -= field_names
+
+            filtered_params = {k: self.params[k] for k in filtered_keys}
+            setattr(self, param_type, class_name(**filtered_params))
+
+    @property
+    def linear_options(self):
+        options = ""
+        for field_name in self._param_field_names["linear"]:
+            options += f" -{field_name} {getattr(self.linear, field_name)}"
+        return options.strip()
+
+    def __repr__(self):
+        return str(self.params)
+
+    def __eq__(self, other):
+        return all(getattr(self, t) == getattr(other, t) for t in self.param_types)
+
+    def __lt__(self, other):
+        # "<" for tuple is automatically lexicographic ordering
+        my_values = tuple(getattr(self, t) for t in self.param_types)
+        other_values = tuple(getattr(other, t) for t in self.param_types)
+        return my_values < other_values
+
+    def __hash__(self):
+        return hash(tuple(getattr(self, t) for t in self.param_types))
+
+
+class GridSearch:
+    def __init__(
+        self,
+        datasets: dict[str, np.matrix],
+        n_folds: int = 3,
+        monitor_metrics: list[str] = ["P@1", "P@3", "P@5"],
+    ):
+        self.datasets = datasets
+        self.n_folds = n_folds
+        self.monitor_metrics = monitor_metrics
+        self.param_metrics = dict()
+
+        self._cached_params = GridParameter()
+        for param_type in self._cached_params.param_types:
+            setattr(self._cached_params, param_type, None)
+        self._cached_transformed_dataset = None
+        self._cached_tree_root = None
+        self._cached_fold_data = None
+        self._cached_model = None
+        self.no_cache = True
+
+        self.num_instances = len(self.datasets["train"]["y"])
+
+    def get_fold_dataset(self, train_idx, valid_idx):
+        def take(data, idx):
+            if isinstance(data, list):
+                return [data[i] for i in idx]
+            else:
+                return data[idx]
+
+        return {
+            "data_format": self.datasets["data_format"],
+            "train": {
+                "y": take(self.datasets["train"]["y"], train_idx),
+                "x": take(self.datasets["train"]["x"], train_idx),
+            },
+            "test": {
+                "y": take(self.datasets["train"]["y"], valid_idx),
+                "x": take(self.datasets["train"]["x"], valid_idx),
+            },
+        }
+
+    def get_transformed_dataset(self, dataset, params):
+        """
+        Get the dataset for the given tf-idf params.
+
+        Args:
+            params (GridParameter): The params to build the dataset.
+
+        Returns:
+            dict: The transformed dataset with "train" and "test" splits, each containing "x" and "y".
+        """
+        tfidf_params = params.tfidf
+        self.no_cache = tfidf_params != self._cached_params.tfidf
+        if self.no_cache:
+            logging.info(f"TFIDF - Preprocessing: {tfidf_params}")
+            if self.datasets["data_format"] not in {"txt", "dataframe"}:
+                logging.info("The TF-IDF parameters are only meaningful for the 'txt' and 'dataframe' data formats.")
+            with __silent__():
+                preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params))
+                self._cached_params.tfidf = tfidf_params
+                self._cached_transformed_dataset = preprocessor.fit_transform(dataset)
+        else:
+            logging.info(f"TFIDF - Using cached data: {tfidf_params}")
+
+        return self._cached_transformed_dataset
+
+    def get_tree_root(self, y, x, params):
+        tree_params = params.tree
+        self.no_cache |= tree_params != self._cached_params.tree
+        if self.no_cache:
+            logging.info(f"Tree - Preprocessing: {tree_params}")
+            with __silent__():
+                label_representation = (y.T * x).tocsr()
+                label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
+                self._cached_params.tree = tree_params
+                self._cached_tree_root = _build_tree(
+                    label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params)
+                )
+                self._cached_tree_root.is_root = True
+        else:
+            logging.info(f"Tree - Using cached data: {tree_params}")
+
+        return self._cached_tree_root
+
+    def get_model(self, y, x, params):
+        """
+        Get the model for the given params.
+
+        Args:
+            y (np.matrix): The labels of the training data.
+            x (np.matrix): The features of the training data.
+            params (GridParameter): The params to build the model.
+
+        Returns:
+            linear.FlatModel | linear.TreeModel: The model for the given params.
+        """
+        root = self.get_tree_root(y, x, params)
+
+        linear_params = params.linear
+
+        if self.no_cache or (linear_params != self._cached_params.linear):
+            logging.info(f"Model - Training: {linear_params}")
+            with __silent__():
+                self._cached_params.linear = linear_params
+                self._cached_model = linear.train_tree(
+                    y,
+                    x,
+                    root=root,
+                    options=params.linear_options,
+                )
+        else:
+            logging.info(f"Model - Using cached data: {linear_params}")
+
+        return self._cached_model
+
+    def compute_scores(self, y, x, model, params):
+        logging.info(f"Metric - Scoring: {params.predict}\n")
+
+        batch_size = 256
+        num_instances = x.shape[0]
+        num_batches = math.ceil(num_instances / batch_size)
+
+        if params not in self.param_metrics:
+            self.param_metrics[params] = linear.get_metrics(self.monitor_metrics, num_classes=y.shape[1])
+
+        for i in range(num_batches):
+            preds = model.predict_values(x[i * batch_size : (i + 1) * batch_size], **asdict(params.predict))
+            target = y[i * batch_size : (i + 1) * batch_size].toarray()
+            self.param_metrics[params].update(preds, target)
+
+    def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, dict[str, float]]:
+        self.param_metrics.clear()
+
+        param_names = search_space_dict.keys()
+        self.search_space = sorted(
+            [
+                GridParameter(dict(zip(param_names, param_values)))
+                for param_values in itertools.product(*search_space_dict.values())
+            ],
+            reverse=True,
+        )
+
+        permutation = np.random.permutation(self.num_instances)
+        index_per_fold = [
+            permutation[
+                int(fold * self.num_instances / self.n_folds) : int((fold + 1) * self.num_instances / self.n_folds)
+            ]
+            for fold in range(self.n_folds)
+        ]
+
+        for fold in range(self.n_folds):
+            train_idx = np.concatenate(index_per_fold[:fold] + index_per_fold[fold + 1 :])
+            valid_idx = index_per_fold[fold]
+            fold_dataset = self.get_fold_dataset(train_idx, valid_idx)
+
+            self._cached_params.tfidf = None
+            for params in self.search_space:
+                logging.info(f"Status - Running fold {fold}, params: {params}")
+
+                transformed_dataset = self.get_transformed_dataset(fold_dataset, params)
+                model = self.get_model(transformed_dataset["train"]["y"], transformed_dataset["train"]["x"], params)
+                self.compute_scores(transformed_dataset["test"]["y"], transformed_dataset["test"]["x"], model, params)
+
+        return {params: metrics.compute() for params, metrics in self.param_metrics.items()}
+
+
+@timer
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--seed", type=int, help="Random seed.")
+    parser.add_argument("--training_file", help="Path to training data.")
+    parser.add_argument("--test_file", help="Path to test data.")
+    parser.add_argument(
+        "--data_format", type=str, default="txt", help="'svm' for SVM format or 'txt' for LibMultiLabel format."
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO)
+    if args.seed is not None:
+        np.random.seed(args.seed)
+
+    dataset = linear.load_dataset(
+        args.data_format,
+        args.training_file,
+        args.test_file,
+    )
+
+    retrain = True
+    n_folds = 3
+    monitor_metrics = ["P@1", "P@3", "P@5"]
+    search_space_dict = {
+        "max_features": [10000, 20000, 100000],
+        "K": [10, 50, 100],
+        "min_df": [1, 2],
+        "prob_A": [2, 3, 4],
+        "c": [0.1, 0.2, 1, 10],
+    }
+
+    search = GridSearch(dataset, n_folds, monitor_metrics)
+    cv_scores = search(search_space_dict)
+    sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True)
+    print(sorted_cv_scores)
+
+    if retrain:
+        best_params, best_cv_scores = sorted_cv_scores[0]
+        print(best_params, best_cv_scores)
+
+        preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf))
+        transformed_dataset = preprocessor.fit_transform(dataset)
+        model = linear.train_tree(
+            transformed_dataset["train"]["y"],
+            transformed_dataset["train"]["x"],
+            best_params.linear_options,
+            **asdict(best_params.tree),
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 7f1ce851..eb4934eb 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -4,6 +4,7 @@
 import numpy as np
 import scipy.sparse as sparse
+from scipy.special import log_expit
 from sparsekmeans import LloydKmeans, ElkanKmeans
 import sklearn.preprocessing
 from tqdm import tqdm
 
@@ -58,16 +59,31 @@ def __init__(
         self.multiclass = False
         self._model_separated = False  # Indicates whether the model has been separated for pruning tree.
 
+    def sigmoid_A(self, x: np.ndarray, prob_A: int):
+        """
+        Calculate log(sigmoid(prob_A * x)).
+
+        Args:
+            x (np.ndarray): An array of decision values.
+            prob_A (int): The tunable parameter of the probability estimation function sigmoid(prob_A * preds).
+
+        Returns:
+            np.ndarray: An array of log-probability estimates with the same shape as x.
+        """
+        return log_expit(prob_A * x)
+
     def predict_values(
         self,
         x: sparse.csr_matrix,
         beam_width: int = 10,
+        prob_A: int = 3,
     ) -> np.ndarray:
         """Calculate the probability estimates associated with x.
 
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
             beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10.
+            prob_A (int, optional): The tunable parameter of the probability estimation function sigmoid(prob_A * preds). Defaults to 3.
 
         Returns:
             np.ndarray: A matrix with dimension number of instances * number of classes.
@@ -81,8 +97,8 @@ def predict_values(
         if not self._model_separated:
             self._separate_model_for_pruning_tree()
             self._model_separated = True
-        all_preds = self._prune_tree_and_predict_values(x, beam_width)  # number of instances * (number of labels + total number of metalabels)
-        return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])])
+        all_preds = self._prune_tree_and_predict_values(x, beam_width, prob_A)  # number of instances * (number of labels + total number of metalabels)
+        return np.vstack([self._beam_search(all_preds[i], beam_width, prob_A) for i in range(all_preds.shape[0])])
 
     def _separate_model_for_pruning_tree(self):
         """
@@ -113,7 +129,7 @@ def _separate_model_for_pruning_tree(self):
             )
             self.subtree_models.append(subtree_flatmodel)
 
-    def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray:
+    def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, prob_A: int) -> np.ndarray:
         """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees.
 
         Only subtrees corresponding to the top beam_width candidates from the root are evaluated,
@@ -122,6 +138,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
             beam_width (int): Number of top candidate branches considered for prediction.
+            prob_A (int): The tunable parameter of the probability estimation function sigmoid(prob_A * preds).
 
         Returns:
             np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels).
@@ -132,7 +149,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
 
         # Calculate root decision values and scores
         root_preds = linear.predict_values(self.root_model, x)
-        children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds))
+        children_scores = 0.0 + self.sigmoid_A(root_preds, prob_A)
 
         slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]]
         all_preds[slice] = root_preds
@@ -159,12 +176,13 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
 
         return all_preds
 
-    def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray:
+    def _beam_search(self, instance_preds: np.ndarray, beam_width: int, prob_A: int) -> np.ndarray:
         """Predict with beam search using cached probability estimates for a single instance.
 
         Args:
             instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels.
             beam_width (int): Number of candidates considered.
+            prob_A (int): The tunable parameter of the probability estimation function sigmoid(prob_A * preds).
 
         Returns:
             np.ndarray: A vector with dimension number of classes.
@@ -182,7 +200,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra
                     continue
                 slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]]
                 pred = instance_preds[slice]
-                children_score = score - np.square(np.maximum(0, 1 - pred))
+                children_score = score + self.sigmoid_A(pred, prob_A)
                 next_level.extend(zip(node.children, children_score.tolist()))
 
             cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width]
@@ -193,7 +211,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra
         for node, score in cur_level:
             slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]]
             pred = instance_preds[slice]
-            scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred)))
+            scores[node.label_map] = np.exp(score + self.sigmoid_A(pred, prob_A))
 
         return scores
 
@@ -204,6 +222,7 @@ def train_tree(
     K=DEFAULT_K,
     dmax=DEFAULT_DMAX,
     verbose: bool = True,
+    root: Node = None,
 ) -> TreeModel:
     """Train a linear model for multi-label data using a divide-and-conquer strategy.
     The algorithm used is based on https://github.com/xmc-aalto/bonsai.
@@ -215,14 +234,16 @@ def train_tree(
         K (int, optional): Maximum degree of nodes in the tree. Defaults to 100.
         dmax (int, optional): Maximum depth of the tree. Defaults to 10.
         verbose (bool, optional): Output extra progress information. Defaults to True.
+        root (Node, optional): A pre-built tree root; if given, K and dmax are ignored. Defaults to None.
 
     Returns:
         TreeModel: A model which can be used in predict_values.
     """
-    label_representation = (y.T * x).tocsr()
-    label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
-    root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax)
-    root.is_root = True
+    if root is None:
+        label_representation = (y.T * x).tocsr()
+        label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
+        root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax)
+        root.is_root = True
 
     num_nodes = 0
     # Both type(x) and type(y) are sparse.csr_matrix
diff --git a/linear_trainer.py b/linear_trainer.py
index b9133857..f5f374fa 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -24,6 +24,7 @@ def linear_test(config, model, datasets, label_mapping):
     predict_kwargs = {}
     if isinstance(model, (TreeModel, EnsembleTreeModel)):
         predict_kwargs["beam_width"] = config.beam_width
+        predict_kwargs["prob_A"] = config.prob_A
 
     for i in tqdm(range(ceil(num_instance / config.eval_batch_size))):
         slice = np.s_[i * config.eval_batch_size : (i + 1) * config.eval_batch_size]
diff --git a/main.py b/main.py
index 7a523f1f..99b4f59a 100644
--- a/main.py
+++ b/main.py
@@ -252,6 +252,12 @@ def add_all_arguments(parser):
         default=10,
         help="The width of the beam search (default: %(default)s)",
     )
+    parser.add_argument(
+        "--prob_A",
+        type=int,
+        default=3,
+        help="The tunable parameter of the probability estimation function sigmoid(prob_A * preds) (default: %(default)s)",
+    )
     # AttentionXML
     parser.add_argument(
         "--cluster_size",
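
Usage note, not part of the diff above: the following is a minimal sketch of how the new grid.py and the prob_A option are meant to fit together. It assumes the script is run from the repository root so that grid.py is importable; the data paths and the small search space are placeholders, not values taken from this change.

    from dataclasses import asdict

    import numpy as np
    import libmultilabel.linear as linear
    from grid import GridSearch

    np.random.seed(42)  # GridSearch shuffles instances into folds, so fix the seed for reproducibility

    # Raw (un-vectorized) data; GridSearch re-applies the tf-idf settings for each parameter combination.
    datasets = linear.load_dataset("txt", "data/train.txt", "data/test.txt")

    search = GridSearch(datasets, n_folds=3, monitor_metrics=["P@1", "P@3", "P@5"])
    cv_scores = search({"max_features": [10000, 20000], "K": [10, 100], "c": [1, 10], "prob_A": [2, 3, 4]})
    best_params, best_metrics = max(cv_scores.items(), key=lambda kv: kv[1]["P@1"])

    # Retrain on the full training set with the best setting, then predict with the new prob_A option,
    # which scores each tree edge by log(sigmoid(prob_A * decision_value)) instead of the old squared hinge term.
    preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf))
    transformed = preprocessor.fit_transform(datasets)
    model = linear.train_tree(
        transformed["train"]["y"], transformed["train"]["x"], best_params.linear_options, **asdict(best_params.tree)
    )
    preds = model.predict_values(transformed["test"]["x"], **asdict(best_params.predict))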