From 575eaf46aec175736b60b07566eca9c189ae8ebf Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 28 Oct 2025 03:31:19 +0000 Subject: [PATCH 01/23] update mvp --- grid.py | 171 +++++++++++++++++++++++++++++++++ libmultilabel/linear/linear.py | 1 + libmultilabel/linear/tree.py | 25 ++++- linear_trainer.py | 2 + run_exp.py | 93 ++++++++++++++++++ 5 files changed, 288 insertions(+), 4 deletions(-) create mode 100644 grid.py create mode 100644 run_exp.py diff --git a/grid.py b/grid.py new file mode 100644 index 00000000..b9769f2c --- /dev/null +++ b/grid.py @@ -0,0 +1,171 @@ +from abc import abstractmethod + +import libmultilabel.linear as linear +import numpy as np +import math + +class Parameter: + def __init__(self, **params): + self.params = params + + def tfidf(self): # pad default value for compatibility + return self.params['tfidf'] + + def tree(self): + return self.params['tree'] + + def params(self): + return self.params['params'] + + def inference(self): + return self.params['inference'] + + +param = Parameter(tfidf={'min_df': 1, 'max_features': 10000}, tree={'K': 2, 'dmax': 100}) + +class GridSearch: + def __init__(self, data_source, n_folds, search_space, config=None): + self.data_source = data_source + self.search_space = search_space + self.config = config + self.n_folds = n_folds + self.metrics = ["P@1", "P@3", "P@5"] + + def __call__(self): + self.build_data() + self.build_fold_idx() + + results = { + (str(tfidf_param), str(param)): {metric: 0 for metric in self.metrics} + for tfidf_param in self.search_space['tfidf'] for param in self.search_space['params'] + } + # for fold, params in zip(self.fold_space, self.search_space): + for tfidf_param in self.search_space['tfidf']: # param should be an instance of a config class + avg_score = {metric: 0 for metric in self.metrics} + for i in range(self.n_folds): + y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ + self.get_fold_data(i, tfidf_param) + for tree + for param in self.search_space['params']: + 
print(f'\nRunning fold {i}\ntfidf: {tfidf_param}\nparams: {param}') + model = self.get_model(y_train_fold, x_train_fold, param) + cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, param) + print(f'cv_score: {cv_score}\n') + for metric in self.metrics: + results[(str(tfidf_param), str(param))][metric] += cv_score[metric] / self.n_folds + + # TODO: Return a function + return sorted(results.items(), key=lambda x: x[1][self.metrics[0]], reverse=True) + + def build_fold_idx(self): + permutation = np.random.permutation(self.num_instances) + index_per_fold = [ + permutation[int(fold * self.num_instances / self.n_folds):int((fold+1) * self.num_instances / self.n_folds)] + for fold in range(self.n_folds) + ] + + self.fold_idx = { + fold: { + 'train': np.concatenate(index_per_fold[:fold] + index_per_fold[fold+1:]), + 'valid': index_per_fold[fold] + } for fold in range(self.n_folds) + } + + @abstractmethod + def build_data(self): + pass + + @abstractmethod + def get_fold_data(self, i, param): + pass + + @abstractmethod + def get_model(self, y_train_fold, x_train_fold, param): + pass + + @abstractmethod + def get_cv_score(self, y_valid_fold, x_valid_fold, model, param): + pass + + +class HyperparameterSearch(GridSearch): + def __init__(self, data_source, n_folds, search_space, config=None): + super().__init__(data_source, n_folds, search_space, config) + + def preprocess_tfidf(self, dataset, param): + preprocessor = linear.Preprocessor(tfidf_params=param) + return preprocessor.fit_transform(dataset) + + def build_data(self): + self.data = {} + + dataset = linear.load_dataset("svm", self.data_source[0], self.data_source[1]) + self.num_instances = len(dataset["train"]["y"]) + tfidf_params = self.search_space['tfidf'] + for param in tfidf_params: + print(f'Preprocessing tfidf: {param}..') + tfidf_data = self.preprocess_tfidf(dataset, param) + self.data[str(param)] = {'dataset': tfidf_data} + # use yield? 
(however, hard to reuse) + + def get_fold_data(self, i, param): + dataset = self.data[str(param)]['dataset']["train"] + return ( + dataset["y"][self.fold_idx[i]['train']], dataset["x"][self.fold_idx[i]['train']], + dataset["y"][self.fold_idx[i]['valid']], dataset["x"][self.fold_idx[i]['valid']] + ) + + def get_model(self, y_train_fold, x_train_fold, param): + model = linear.train_tree(y_train_fold, x_train_fold, **param) # train with param and fold data + return model + + def metrics_in_batches(self, y, x, model, *args, **kwargs): + batch_size = 256 + num_instances = x.shape[0] + num_batches = math.ceil(num_instances / batch_size) + + metrics = linear.get_metrics(self.metrics, num_classes=y.shape[1]) + + for i in range(num_batches): + preds = linear.predict_values(model, x[i * batch_size : (i + 1) * batch_size]) + target = y[i * batch_size : (i + 1) * batch_size].toarray() + metrics.update(preds, target) + + return metrics.compute() + + def get_cv_score(self, y_valid_fold, x_valid_fold, model, param): + # calculate the metric with the model + score = self.metrics_in_batches( + y_valid_fold, + x_valid_fold, + model, + **param + ) + return score + + +class ProbEstimatiteSearch(GridSearch): + def __init__(self, data_source, n_folds, search_space, config=None): + super().__init__(data_source, n_folds, search_space, config) + + def build_data(self): + data = {'unique': {}} + unique_data = None # from libmultilabel preprocessing + for i in range(self.n_folds): + train_idx, valid_idx = None, None + y_train_fold, x_train_fold = unique_data[train_idx] + y_valid_fold, x_valid_fold = unique_data[valid_idx] + data['unique'][i] = unique_data + + return data + + def get_fold_data(self, data, i, param): + return data['unique'][i] + + def get_model(self, y_train_fold, x_train_fold, param): + model = None # train normally with fold data + return model + + def get_cv_score(self, y_valid_fold, x_valid_fold, model, param): + score = None # calculate the metric with the model and the 
hyperparameter A + return score diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py index 04d25a21..6a47800a 100644 --- a/libmultilabel/linear/linear.py +++ b/libmultilabel/linear/linear.py @@ -198,6 +198,7 @@ def train_1vsrest( multiclass: bool = False, options: str = "", verbose: bool = True, + *args, **kwargs, ) -> FlatModel: """Train a linear model parallel on labels for multi-label data using a one-vs-rest strategy. diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 7f1ce851..c4bada55 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -204,6 +204,7 @@ def train_tree( K=DEFAULT_K, dmax=DEFAULT_DMAX, verbose: bool = True, + *args, **kwargs, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. The algorithm used is based on https://github.com/xmc-aalto/bonsai. @@ -264,6 +265,18 @@ def visit(node): return TreeModel(root, flat_model, node_ptr) +import os, sys + +class silent_print: + def __enter__(self): + self._original_stdout = sys.stdout + sys.stdout = open(os.devnull, 'w') + + def __exit__(self, exc_type, exc_val, exc_tb): + sys.stdout.close() + sys.stdout = self._original_stdout + + def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: """Build the tree recursively by kmeans clustering. 
@@ -284,10 +297,14 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, else: kmeans_algo = LloydKmeans - kmeans = kmeans_algo( - n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=True - ) - metalabels = kmeans.fit(label_representation) + if True: + metalabels = np.random.randint(0, K, label_representation.shape[0]) + else: + with silent_print(): + kmeans = kmeans_algo( + n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=False + ) + metalabels = kmeans.fit(label_representation) unique_labels = np.unique(metalabels) if len(unique_labels) == K: diff --git a/linear_trainer.py b/linear_trainer.py index b9133857..1105f62a 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -45,6 +45,7 @@ def linear_train(datasets, config): multiclass = is_multiclass_dataset(datasets["train"], "y") # train + # 1 if config.linear_technique == "tree": if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") @@ -74,6 +75,7 @@ def linear_train(datasets, config): multiclass=multiclass, options=config.liblinear_options, ) + # 2 return model diff --git a/run_exp.py b/run_exp.py new file mode 100644 index 00000000..c3c16257 --- /dev/null +++ b/run_exp.py @@ -0,0 +1,93 @@ +import libmultilabel.linear as linear +import grid as grid +import numpy as np + +import time +import json +from tqdm import tqdm + + +def run_ovr(dataset, options, *args, **kwargs): + training_start = time.time() + ovr_model = linear.train_1vsrest( + dataset["train"]["y"], + dataset["train"]["x"], + options=options + ) + training_time = time.time() - training_start + return ovr_model, training_time + +def run_tree(dataset, options, K, dmax, *args, **kwargs): + training_start = time.time() + tree_model = linear.train_tree( + dataset["train"]["y"], + dataset["train"]["x"], + options=options, + K=K, + dmax=dmax + ) + training_time = time.time() - training_start + return 
tree_model, training_time + + +if __name__ == "__main__": + import argparse + np.random.seed(20250820) + + parser = argparse.ArgumentParser(description="Parse command-line arguments.") + parser.add_argument("--dataset", type=str, default="EUR-Lex", help="Dataset name (e.g., AmazonCat-13K, EUR-Lex)") + args = parser.parse_args() + + dataset_ = args.dataset + + # dataset = linear.load_dataset("svm", f"data/{dataset_}/train.svm") # , f"data/{dataset}/test.svm" + data_source = [f'data/{dataset_}/train.svm', f'data/{dataset_}/test.svm'] + search_space = { + 'tfidf': { + 'min_df': [1, 2], + 'max_features': [10000, 320000], + }, + 'params': { + 'C': [1, 2], + 'K': [2, 100], + }, + } + search_space = { + 'tfidf': [ + {'max_features': i} for i in [10000] + ], + 'params': [ + {'K': i} for i in [2, 100] + ], + } + print(search_space) + n_folds = 3 + grid_search = grid.HyperparameterSearch(data_source, n_folds, search_space) + results = grid_search() + print(results) + # if num_classes != -1: + # dataset["train"]["y"] = [[yij % num_classes for yij in yi] for yi in dataset["train"]["y"]] + + # preprocessor = linear.Preprocessor() + # dataset = preprocessor.fit_transform(dataset) + + # results = { + # exp_name: { + # t: 0 for t in exp_threads + # } + # for exp_name in exp_names + # } + + # for exp_name in exp_names: + # for exp_thread in tqdm(exp_threads, leave=True, colour="blue", desc=exp_name): + # if exp_name == 'Strategy B': + # do_parallel = True + # options = "-m 1" + # num_threads = exp_thread + # else: + # do_parallel = False + # options = f"-m {exp_thread}" + # num_threads = -1 + + # _, training_time = run_ovr(dataset, options, num_threads, do_parallel, use_dedicated_x) + # results[exp_name][exp_thread] = training_time From 95da0085f601a41272710dc9a1457c2e939aad7c Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 28 Oct 2025 18:38:11 +0000 Subject: [PATCH 02/23] almost finish GridParameter --- grid.py | 245 ++++++++++++++++++++++------------- 
libmultilabel/linear/tree.py | 10 +- 2 files changed, 158 insertions(+), 97 deletions(-) diff --git a/grid.py b/grid.py index b9769f2c..fe3af7b4 100644 --- a/grid.py +++ b/grid.py @@ -1,27 +1,72 @@ from abc import abstractmethod +from dataclasses import make_dataclass, field, fields, asdict +from typing import Callable import libmultilabel.linear as linear import numpy as np import math -class Parameter: - def __init__(self, **params): + +class GridParameter: + + _tfidf_fields = [ + ('ngram_range', tuple[int, int], field(default=(1, 1))), + ('max_features', int, field(default=None)), + ('min_df', float | int, field(default=1)), + ('stop_words', str | list, field(default=None)), + ('strip_accents', str | Callable, field(default=None)), + ('tokenizer', Callable, field(default=None)), + ] + _tree_fields = [ + ('dmax', int, field(default=10)), + ('K', int, field(default=8)), + ] + _linear_fields = [ + ('s', int, field(default=1)), + ('c', float, field(default=1)), + ('B', int, field(default=-1)), + ] + _predict_fields = [ + ('beam_width', int, field(default=10)), + ('A', int, field(default=1)), + ] + + param_types = { + 'tfidf': make_dataclass('_TfidfParams', _tfidf_fields, frozen=True, order=True), + 'tree': make_dataclass('_TreeParams', _tree_fields, frozen=True, order=True), + 'linear': make_dataclass('_LinearParams', _linear_fields, frozen=True, order=True), + 'predict': make_dataclass('_PredictParams', _predict_fields, frozen=True, order=True), + } + + def __init__(self, params: dict): self.params = params - - def tfidf(self): # pad default value for compatibility - return self.params['tfidf'] + for param_type, class_name in self.param_types.items(): + field_names = {f.name for f in fields(class_name)} + _params = {k: v for k, v in self.params.items() if k in field_names} + setattr(self, param_type, class_name(**_params)) + + @property + def linear_options(self): + options = '' + for f in fields(self.linear): + options += f" -{f.name} {getattr(self.linear, 
f.name)}" + return options.strip() - def tree(self): - return self.params['tree'] + def __repr__(self): + return str(self.params) - def params(self): - return self.params['params'] + def __eq__(self, other): + return all(getattr(self, t) == getattr(other, t) for t in self.param_types) - def inference(self): - return self.params['inference'] + def __lt__(self, other): + # "<" for tuple is automatically lexicographic ordering + my_values = tuple(getattr(self, t) for t in self.param_types) + other_values = tuple(getattr(other, t) for t in self.param_types) + return my_values < other_values + def __hash__(self): + return hash(tuple(getattr(self, t) for t in self.param_types)) -param = Parameter(tfidf={'min_df': 1, 'max_features': 10000}, tree={'K': 2, 'dmax': 100}) class GridSearch: def __init__(self, data_source, n_folds, search_space, config=None): @@ -31,31 +76,8 @@ def __init__(self, data_source, n_folds, search_space, config=None): self.n_folds = n_folds self.metrics = ["P@1", "P@3", "P@5"] - def __call__(self): - self.build_data() - self.build_fold_idx() - - results = { - (str(tfidf_param), str(param)): {metric: 0 for metric in self.metrics} - for tfidf_param in self.search_space['tfidf'] for param in self.search_space['params'] - } - # for fold, params in zip(self.fold_space, self.search_space): - for tfidf_param in self.search_space['tfidf']: # param should be an instance of a config class - avg_score = {metric: 0 for metric in self.metrics} - for i in range(self.n_folds): - y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ - self.get_fold_data(i, tfidf_param) - for tree - for param in self.search_space['params']: - print(f'\nRunning fold {i}\ntfidf: {tfidf_param}\nparams: {param}') - model = self.get_model(y_train_fold, x_train_fold, param) - cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, param) - print(f'cv_score: {cv_score}\n') - for metric in self.metrics: - results[(str(tfidf_param), str(param))][metric] += cv_score[metric] / 
self.n_folds - - # TODO: Return a function - return sorted(results.items(), key=lambda x: x[1][self.metrics[0]], reverse=True) + def sort_search_space(self): + self.search_space.sort() def build_fold_idx(self): permutation = np.random.permutation(self.num_instances) @@ -71,55 +93,13 @@ def build_fold_idx(self): } for fold in range(self.n_folds) } - @abstractmethod - def build_data(self): - pass - - @abstractmethod - def get_fold_data(self, i, param): - pass - - @abstractmethod - def get_model(self, y_train_fold, x_train_fold, param): - pass - - @abstractmethod - def get_cv_score(self, y_valid_fold, x_valid_fold, model, param): - pass - - -class HyperparameterSearch(GridSearch): - def __init__(self, data_source, n_folds, search_space, config=None): - super().__init__(data_source, n_folds, search_space, config) - - def preprocess_tfidf(self, dataset, param): - preprocessor = linear.Preprocessor(tfidf_params=param) - return preprocessor.fit_transform(dataset) - - def build_data(self): - self.data = {} - - dataset = linear.load_dataset("svm", self.data_source[0], self.data_source[1]) - self.num_instances = len(dataset["train"]["y"]) - tfidf_params = self.search_space['tfidf'] - for param in tfidf_params: - print(f'Preprocessing tfidf: {param}..') - tfidf_data = self.preprocess_tfidf(dataset, param) - self.data[str(param)] = {'dataset': tfidf_data} - # use yield? 
(however, hard to reuse) - - def get_fold_data(self, i, param): - dataset = self.data[str(param)]['dataset']["train"] + def get_fold_data(self, dataset, i, params): return ( dataset["y"][self.fold_idx[i]['train']], dataset["x"][self.fold_idx[i]['train']], dataset["y"][self.fold_idx[i]['valid']], dataset["x"][self.fold_idx[i]['valid']] ) - def get_model(self, y_train_fold, x_train_fold, param): - model = linear.train_tree(y_train_fold, x_train_fold, **param) # train with param and fold data - return model - - def metrics_in_batches(self, y, x, model, *args, **kwargs): + def get_cv_score(self, y, x, model, params): batch_size = 256 num_instances = x.shape[0] num_batches = math.ceil(num_instances / batch_size) @@ -133,15 +113,94 @@ def metrics_in_batches(self, y, x, model, *args, **kwargs): return metrics.compute() - def get_cv_score(self, y_valid_fold, x_valid_fold, model, param): - # calculate the metric with the model - score = self.metrics_in_batches( - y_valid_fold, - x_valid_fold, - model, - **param - ) - return score + def output(self): # return sorted params list with scores by default + return sorted(self.results.items(), key=lambda x: x[1][self.metrics[0]], reverse=True) + + def __call__(self): + self.sort_search_space() + self.build_fold_idx() + + self.results = { + params: {metric: 0 for metric in self.metrics} + for params in self.search_space + } + # for fold, params in zip(self.fold_space, self.search_space): + for params in self.search_space: # params should be an instance of a config class + avg_score = {metric: 0 for metric in self.metrics} + dataset = self.get_dataset(params) + # should be 000111222... or 012012012... (for same tfidf params but different params) + # don't know whether 012012012 waste space (view or new data)? + for i in range(self.n_folds): + # secretly caching the tree root for each fold.. 
+ y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ + self.get_fold_data(dataset, i, params) + + print(f'\nRunning fold {i}\nparams: {params}') + self.model = self.get_model(y_train_fold, x_train_fold, params) + cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params) + print(f'cv_score: {cv_score}\n') + + for metric in self.metrics: + self.results[params][metric] += cv_score[metric] / self.n_folds + + return self.output() + + @abstractmethod + def get_dataset(self, params) -> dict[str, np.matrix]: + """ + Get the dataset for the given params. + + Args: + params (GridParameter): The params to build the dataset. + + Returns: + dict[str, np.matrix]: The keys should be 'y' and 'x'. + """ + pass + + @abstractmethod + def get_model(self, y, x, params) -> linear.FlatModel | linear.TreeModel: + """ + Get the model for the given params. + + Args: + y (np.matrix): The labels of the training data. + x (np.matrix): The features of the training data. + params (GridParameter): The params to build the model. + + Returns: + linear.FlatModel | linear.TreeModel: The model for the given params. 
+ """ + pass + + +class HyperparameterSearch(GridSearch): + def __init__(self, data_source, n_folds, search_space, config=None): + super().__init__(data_source, n_folds, search_space, config) + self._cached_tfidf_params = None + self._cached_tfidf_data = None + self._cached_tree_params = None + # pass directly in the product code (linear_trainer.py) + self.dataset = linear.load_dataset("svm", self.data_source[0], self.data_source[1]) + self.num_instances = len(self.dataset["train"]["y"]) + + def get_dataset(self, params): + tfidf_params = params.tfidf + if tfidf_params != self._cached_tfidf_params: + print(f'Preprocessing tfidf: {tfidf_params}..') + self._cached_tfidf_params = tfidf_params + self._cached_tfidf_data = linear.Preprocessor(tfidf_params=tfidf_params).fit_transform(self.dataset) + return self._cached_tfidf_data + + def get_tree_root(self, y, x, params): + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) + root.is_root = True + + def get_model(self, y, x, params): + model = linear.train_tree(y, x, **params) # train with params and fold data + return model class ProbEstimatiteSearch(GridSearch): @@ -159,13 +218,13 @@ def build_data(self): return data - def get_fold_data(self, data, i, param): + def get_fold_data(self, data, i, params): return data['unique'][i] - def get_model(self, y_train_fold, x_train_fold, param): + def get_model(self, y_train_fold, x_train_fold, params): model = None # train normally with fold data return model - def get_cv_score(self, y_valid_fold, x_valid_fold, model, param): + def get_cv_score(self, y_valid_fold, x_valid_fold, model, params): score = None # calculate the metric with the model and the hyperparameter A return score diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index c4bada55..d2044abe 100644 --- a/libmultilabel/linear/tree.py 
+++ b/libmultilabel/linear/tree.py @@ -204,6 +204,7 @@ def train_tree( K=DEFAULT_K, dmax=DEFAULT_DMAX, verbose: bool = True, + tree_root: Node = None, *args, **kwargs, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. @@ -220,10 +221,11 @@ def train_tree( Returns: TreeModel: A model which can be used in predict_values. """ - label_representation = (y.T * x).tocsr() - label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) - root.is_root = True + if tree_root is None: + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) + root.is_root = True num_nodes = 0 # Both type(x) and type(y) are sparse.csr_matrix From 47063e05419ed07ab102b038065c6d92b55d8515 Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 28 Oct 2025 19:08:13 +0000 Subject: [PATCH 03/23] first version implemented --- grid.py | 62 +++++++++++++++++++++++++++--------- libmultilabel/linear/tree.py | 25 ++++----------- run_exp.py | 11 ++----- 3 files changed, 56 insertions(+), 42 deletions(-) diff --git a/grid.py b/grid.py index fe3af7b4..beb79595 100644 --- a/grid.py +++ b/grid.py @@ -1,12 +1,27 @@ +import os +import sys from abc import abstractmethod from dataclasses import make_dataclass, field, fields, asdict from typing import Callable import libmultilabel.linear as linear +from libmultilabel.linear.tree import _build_tree, silent_print + +import sklearn.preprocessing import numpy as np import math +class silent_print: + def __enter__(self): + self._original_stdout = sys.stdout + sys.stdout = open(os.devnull, 'w') + + def __exit__(self, exc_type, exc_val, exc_tb): + sys.stdout.close() + sys.stdout = self._original_stdout + + class GridParameter: _tfidf_fields = [ @@ -69,9 +84,9 @@ 
def __hash__(self): class GridSearch: - def __init__(self, data_source, n_folds, search_space, config=None): + def __init__(self, data_source: tuple[str, str], n_folds: int, search_space: list[dict], config=None): self.data_source = data_source - self.search_space = search_space + self.search_space = [GridParameter(params) for params in search_space] self.config = config self.n_folds = n_folds self.metrics = ["P@1", "P@3", "P@5"] @@ -130,14 +145,14 @@ def __call__(self): dataset = self.get_dataset(params) # should be 000111222... or 012012012... (for same tfidf params but different params) # don't know whether 012012012 waste space (view or new data)? - for i in range(self.n_folds): + for fold in range(self.n_folds): # secretly caching the tree root for each fold.. y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ - self.get_fold_data(dataset, i, params) + self.get_fold_data(dataset, fold, params) - print(f'\nRunning fold {i}\nparams: {params}') - self.model = self.get_model(y_train_fold, x_train_fold, params) - cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params) + print(f'\nRunning fold {fold}\nparams: {params}') + self.model = self.get_model(y_train_fold, x_train_fold, fold, params) + cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, self.model, params) print(f'cv_score: {cv_score}\n') for metric in self.metrics: @@ -159,7 +174,7 @@ def get_dataset(self, params) -> dict[str, np.matrix]: pass @abstractmethod - def get_model(self, y, x, params) -> linear.FlatModel | linear.TreeModel: + def get_model(self, y, x, fold, params) -> linear.FlatModel | linear.TreeModel: """ Get the model for the given params. 
@@ -180,6 +195,7 @@ def __init__(self, data_source, n_folds, search_space, config=None): self._cached_tfidf_params = None self._cached_tfidf_data = None self._cached_tree_params = None + self._cached_tree_roots = {fold: None for fold in range(self.n_folds)} # pass directly in the product code (linear_trainer.py) self.dataset = linear.load_dataset("svm", self.data_source[0], self.data_source[1]) self.num_instances = len(self.dataset["train"]["y"]) @@ -189,17 +205,33 @@ def get_dataset(self, params): if tfidf_params != self._cached_tfidf_params: print(f'Preprocessing tfidf: {tfidf_params}..') self._cached_tfidf_params = tfidf_params - self._cached_tfidf_data = linear.Preprocessor(tfidf_params=tfidf_params).fit_transform(self.dataset) + with silent_print(): + preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) + self._cached_tfidf_data = preprocessor.fit_transform(self.dataset)['train'] + return self._cached_tfidf_data def get_tree_root(self, y, x, params): - label_representation = (y.T * x).tocsr() - label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) - root.is_root = True + with silent_print(): + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, **params) + root.is_root = True + + return root + + def get_model(self, y, x, fold, params): + tree_params = params.tree + if tree_params != self._cached_tree_params: + self._cached_tree_params = tree_params + self._cached_tree_roots = {fold: None for fold in range(self.n_folds)} + + if self._cached_tree_roots[fold] is None: + print(f'Preprocessing tree: {tree_params} on fold {fold}..') + self._cached_tree_roots[fold] = self.get_tree_root(y, x, asdict(tree_params)) + + model = linear.train_tree(y, x, 
root=self._cached_tree_roots[fold], options=params.linear_options) - def get_model(self, y, x, params): - model = linear.train_tree(y, x, **params) # train with params and fold data return model diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index d2044abe..777838c1 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -204,7 +204,7 @@ def train_tree( K=DEFAULT_K, dmax=DEFAULT_DMAX, verbose: bool = True, - tree_root: Node = None, + root: Node = None, *args, **kwargs, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. @@ -221,7 +221,7 @@ def train_tree( Returns: TreeModel: A model which can be used in predict_values. """ - if tree_root is None: + if root is None: label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) @@ -267,18 +267,6 @@ def visit(node): return TreeModel(root, flat_model, node_ptr) -import os, sys - -class silent_print: - def __enter__(self): - self._original_stdout = sys.stdout - sys.stdout = open(os.devnull, 'w') - - def __exit__(self, exc_type, exc_val, exc_tb): - sys.stdout.close() - sys.stdout = self._original_stdout - - def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, d: int, K: int, dmax: int) -> Node: """Build the tree recursively by kmeans clustering. 
@@ -302,11 +290,10 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, if True: metalabels = np.random.randint(0, K, label_representation.shape[0]) else: - with silent_print(): - kmeans = kmeans_algo( - n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=False - ) - metalabels = kmeans.fit(label_representation) + kmeans = kmeans_algo( + n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=False + ) + metalabels = kmeans.fit(label_representation) unique_labels = np.unique(metalabels) if len(unique_labels) == K: diff --git a/run_exp.py b/run_exp.py index c3c16257..be04591b 100644 --- a/run_exp.py +++ b/run_exp.py @@ -52,14 +52,9 @@ def run_tree(dataset, options, K, dmax, *args, **kwargs): 'K': [2, 100], }, } - search_space = { - 'tfidf': [ - {'max_features': i} for i in [10000] - ], - 'params': [ - {'K': i} for i in [2, 100] - ], - } + search_space = [ + {'max_features': i, 'K': j} for i in [10000] for j in [2, 100] + ] print(search_space) n_folds = 3 grid_search = grid.HyperparameterSearch(data_source, n_folds, search_space) From 214b28c4a720c80b15289a38249ec33805481e0b Mon Sep 17 00:00:00 2001 From: chcwww Date: Thu, 6 Nov 2025 05:46:12 +0000 Subject: [PATCH 04/23] update for demo --- grid.py | 146 +++++++++++++++++-------------- libmultilabel/linear/__init__.py | 2 +- libmultilabel/linear/linear.py | 2 +- libmultilabel/linear/tree.py | 3 +- linear_trainer.py | 13 ++- run_exp.py | 88 +++++-------------- 6 files changed, 120 insertions(+), 134 deletions(-) diff --git a/grid.py b/grid.py index beb79595..e8c50d65 100644 --- a/grid.py +++ b/grid.py @@ -1,25 +1,36 @@ -import os -import sys from abc import abstractmethod from dataclasses import make_dataclass, field, fields, asdict from typing import Callable +import os +import sys +import logging + import libmultilabel.linear as linear -from libmultilabel.linear.tree import _build_tree, silent_print +from 
libmultilabel.linear.tree import _build_tree import sklearn.preprocessing import numpy as np import math -class silent_print: +# suppress inevitable outputs from sparsekmeans and sklearn preprocessors +class _silent_: + def __init__(self): + self.stderr = os.dup(2) + self.devnull = os.open(os.devnull, os.O_WRONLY) + def __enter__(self): - self._original_stdout = sys.stdout + os.dup2(self.devnull, 2) + self.stdout = sys.stdout sys.stdout = open(os.devnull, 'w') - def __exit__(self, exc_type, exc_val, exc_tb): + def __exit__(self, type, value, traceback): + os.dup2(self.stderr, 2) + os.close(self.devnull) + os.close(self.stderr) sys.stdout.close() - sys.stdout = self._original_stdout + sys.stdout = self.stdout class GridParameter: @@ -47,10 +58,10 @@ class GridParameter: ] param_types = { - 'tfidf': make_dataclass('_TfidfParams', _tfidf_fields, frozen=True, order=True), - 'tree': make_dataclass('_TreeParams', _tree_fields, frozen=True, order=True), - 'linear': make_dataclass('_LinearParams', _linear_fields, frozen=True, order=True), - 'predict': make_dataclass('_PredictParams', _predict_fields, frozen=True, order=True), + 'tfidf': make_dataclass('TfidfParams', _tfidf_fields, frozen=True, order=True), + 'tree': make_dataclass('TreeParams', _tree_fields, frozen=True, order=True), + 'linear': make_dataclass('LinearParams', _linear_fields, frozen=True, order=True), + 'predict': make_dataclass('PredictParams', _predict_fields, frozen=True, order=True), } def __init__(self, params: dict): @@ -84,12 +95,20 @@ def __hash__(self): class GridSearch: - def __init__(self, data_source: tuple[str, str], n_folds: int, search_space: list[dict], config=None): - self.data_source = data_source + def __init__( + self, + datasets: dict[str, np.matrix], + n_folds: int, + search_space: list[dict], + metrics: list[str], + ): + self.datasets = datasets self.search_space = [GridParameter(params) for params in search_space] - self.config = config self.n_folds = n_folds - self.metrics = ["P@1", 
"P@3", "P@5"] + self.metrics = metrics + self.results = { + params: {metric: 0 for metric in self.metrics} for params in self.search_space + } def sort_search_space(self): self.search_space.sort() @@ -108,13 +127,15 @@ def build_fold_idx(self): } for fold in range(self.n_folds) } - def get_fold_data(self, dataset, i, params): + def get_fold_data(self, dataset, fold, params): return ( - dataset["y"][self.fold_idx[i]['train']], dataset["x"][self.fold_idx[i]['train']], - dataset["y"][self.fold_idx[i]['valid']], dataset["x"][self.fold_idx[i]['valid']] + dataset["y"][self.fold_idx[fold]['train']], dataset["x"][self.fold_idx[fold]['train']], + dataset["y"][self.fold_idx[fold]['valid']], dataset["x"][self.fold_idx[fold]['valid']] ) def get_cv_score(self, y, x, model, params): + logging.info(f'Scoring params: {params.predict}') + batch_size = 256 num_instances = x.shape[0] num_batches = math.ceil(num_instances / batch_size) @@ -122,38 +143,32 @@ def get_cv_score(self, y, x, model, params): metrics = linear.get_metrics(self.metrics, num_classes=y.shape[1]) for i in range(num_batches): - preds = linear.predict_values(model, x[i * batch_size : (i + 1) * batch_size]) + preds = model.predict_values( + x[i * batch_size : (i + 1) * batch_size], + **asdict(params.predict)) target = y[i * batch_size : (i + 1) * batch_size].toarray() metrics.update(preds, target) - return metrics.compute() + scores = metrics.compute() + logging.info(f'cv_score: {scores}\n') + + return scores - def output(self): # return sorted params list with scores by default + def output(self): return sorted(self.results.items(), key=lambda x: x[1][self.metrics[0]], reverse=True) def __call__(self): self.sort_search_space() self.build_fold_idx() - self.results = { - params: {metric: 0 for metric in self.metrics} - for params in self.search_space - } - # for fold, params in zip(self.fold_space, self.search_space): - for params in self.search_space: # params should be an instance of a config class - avg_score = 
{metric: 0 for metric in self.metrics} + for params in self.search_space: dataset = self.get_dataset(params) - # should be 000111222... or 012012012... (for same tfidf params but different params) - # don't know whether 012012012 waste space (view or new data)? for fold in range(self.n_folds): - # secretly caching the tree root for each fold.. y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ self.get_fold_data(dataset, fold, params) - print(f'\nRunning fold {fold}\nparams: {params}') - self.model = self.get_model(y_train_fold, x_train_fold, fold, params) - cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, self.model, params) - print(f'cv_score: {cv_score}\n') + model = self.get_model(y_train_fold, x_train_fold, fold, params) + cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params) for metric in self.metrics: self.results[params][metric] += cv_score[metric] / self.n_folds @@ -190,29 +205,28 @@ def get_model(self, y, x, fold, params) -> linear.FlatModel | linear.TreeModel: class HyperparameterSearch(GridSearch): - def __init__(self, data_source, n_folds, search_space, config=None): - super().__init__(data_source, n_folds, search_space, config) + def __init__(self, datasets, n_folds, search_space, metrics=["P@1", "P@3", "P@5"]): + super().__init__(datasets, n_folds, search_space, metrics) self._cached_tfidf_params = None self._cached_tfidf_data = None self._cached_tree_params = None self._cached_tree_roots = {fold: None for fold in range(self.n_folds)} - # pass directly in the product code (linear_trainer.py) - self.dataset = linear.load_dataset("svm", self.data_source[0], self.data_source[1]) - self.num_instances = len(self.dataset["train"]["y"]) + + self.num_instances = len(self.datasets["train"]["y"]) def get_dataset(self, params): tfidf_params = params.tfidf if tfidf_params != self._cached_tfidf_params: - print(f'Preprocessing tfidf: {tfidf_params}..') + logging.info(f'Preprocessing tfidf: {tfidf_params}..') 
self._cached_tfidf_params = tfidf_params - with silent_print(): + with _silent_(): preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) - self._cached_tfidf_data = preprocessor.fit_transform(self.dataset)['train'] + self._cached_tfidf_data = preprocessor.fit_transform(self.datasets)['train'] return self._cached_tfidf_data def get_tree_root(self, y, x, params): - with silent_print(): + with _silent_(): label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) root = _build_tree(label_representation, np.arange(y.shape[1]), 0, **params) @@ -221,13 +235,15 @@ def get_tree_root(self, y, x, params): return root def get_model(self, y, x, fold, params): + logging.info(f'\nRunning fold {fold}\nparams: {params}') + tree_params = params.tree if tree_params != self._cached_tree_params: self._cached_tree_params = tree_params self._cached_tree_roots = {fold: None for fold in range(self.n_folds)} if self._cached_tree_roots[fold] is None: - print(f'Preprocessing tree: {tree_params} on fold {fold}..') + logging.info(f'Preprocessing tree: {tree_params} on fold {fold}..') self._cached_tree_roots[fold] = self.get_tree_root(y, x, asdict(tree_params)) model = linear.train_tree(y, x, root=self._cached_tree_roots[fold], options=params.linear_options) @@ -235,28 +251,28 @@ def get_model(self, y, x, fold, params): return model -class ProbEstimatiteSearch(GridSearch): - def __init__(self, data_source, n_folds, search_space, config=None): - super().__init__(data_source, n_folds, search_space, config) +# class ProbEstimatiteSearch(GridSearch): +# def __init__(self, datasets, n_folds, search_space, config=None): +# super().__init__(datasets, n_folds, search_space, config) - def build_data(self): - data = {'unique': {}} - unique_data = None # from libmultilabel preprocessing - for i in range(self.n_folds): - train_idx, valid_idx = None, None - y_train_fold, x_train_fold = unique_data[train_idx] - 
y_valid_fold, x_valid_fold = unique_data[valid_idx] - data['unique'][i] = unique_data +# def build_data(self): +# data = {'unique': {}} +# unique_data = None # from libmultilabel preprocessing +# for i in range(self.n_folds): +# train_idx, valid_idx = None, None +# y_train_fold, x_train_fold = unique_data[train_idx] +# y_valid_fold, x_valid_fold = unique_data[valid_idx] +# data['unique'][i] = unique_data - return data +# return data - def get_fold_data(self, data, i, params): - return data['unique'][i] +# def get_fold_data(self, data, i, params): +# return data['unique'][i] - def get_model(self, y_train_fold, x_train_fold, params): - model = None # train normally with fold data - return model +# def get_model(self, y_train_fold, x_train_fold, params): +# model = None # train normally with fold data +# return model - def get_cv_score(self, y_valid_fold, x_valid_fold, model, params): - score = None # calculate the metric with the model and the hyperparameter A - return score +# def get_cv_score(self, y_valid_fold, x_valid_fold, model, params): +# score = None # calculate the metric with the model and the hyperparameter A +# return score diff --git a/libmultilabel/linear/__init__.py b/libmultilabel/linear/__init__.py index 7cdf30bb..efe24120 100644 --- a/libmultilabel/linear/__init__.py +++ b/libmultilabel/linear/__init__.py @@ -3,4 +3,4 @@ from .metrics import * from .preprocessor import * from .tree import * -from .utils import * +from .utils import * \ No newline at end of file diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py index 6a47800a..d70620bc 100644 --- a/libmultilabel/linear/linear.py +++ b/libmultilabel/linear/linear.py @@ -44,7 +44,7 @@ def __init__( self.thresholds = thresholds self.multiclass = multiclass - def predict_values(self, x: sparse.csr_matrix) -> np.ndarray: + def predict_values(self, x: sparse.csr_matrix, *args, **kwargs) -> np.ndarray: """Calculate the decision values associated with x. 
Args: diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 777838c1..288e3061 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -62,6 +62,7 @@ def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, + *args, **kwargs, ) -> np.ndarray: """Calculate the probability estimates associated with x. @@ -287,7 +288,7 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, else: kmeans_algo = LloydKmeans - if True: + if False: metalabels = np.random.randint(0, K, label_representation.shape[0]) else: kmeans = kmeans_algo( diff --git a/linear_trainer.py b/linear_trainer.py index 1105f62a..49cc6401 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -44,8 +44,17 @@ def linear_train(datasets, config): # detect task type multiclass = is_multiclass_dataset(datasets["train"], "y") - # train - # 1 + do_grid = False + if do_grid: + search_space = [ + {'max_features': i, 'K': j, 'min_df': k, 'c': l} + for i in [10000, 20000] for j in [10, 100] for k in [1, 2] for l in [0.1, 0.2] + ] + n_folds = 3 + grid_search = linear.HyperparameterSearch(datasets, n_folds, search_space) + results = grid_search() + best_params = results[0] + if config.linear_technique == "tree": if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") diff --git a/run_exp.py b/run_exp.py index be04591b..84f44328 100644 --- a/run_exp.py +++ b/run_exp.py @@ -7,31 +7,11 @@ from tqdm import tqdm -def run_ovr(dataset, options, *args, **kwargs): - training_start = time.time() - ovr_model = linear.train_1vsrest( - dataset["train"]["y"], - dataset["train"]["x"], - options=options - ) - training_time = time.time() - training_start - return ovr_model, training_time - -def run_tree(dataset, options, K, dmax, *args, **kwargs): - training_start = time.time() - tree_model = linear.train_tree( - dataset["train"]["y"], - dataset["train"]["x"], - options=options, - K=K, - dmax=dmax - ) - 
training_time = time.time() - training_start - return tree_model, training_time - - if __name__ == "__main__": import argparse + import logging + + logging.basicConfig(level=logging.INFO) np.random.seed(20250820) parser = argparse.ArgumentParser(description="Parse command-line arguments.") @@ -40,49 +20,29 @@ def run_tree(dataset, options, K, dmax, *args, **kwargs): dataset_ = args.dataset - # dataset = linear.load_dataset("svm", f"data/{dataset_}/train.svm") # , f"data/{dataset}/test.svm" - data_source = [f'data/{dataset_}/train.svm', f'data/{dataset_}/test.svm'] - search_space = { - 'tfidf': { - 'min_df': [1, 2], - 'max_features': [10000, 320000], - }, - 'params': { - 'C': [1, 2], - 'K': [2, 100], - }, - } + datasets = linear.load_dataset("svm", f"data/{dataset_}/train.svm") # , f"data/{dataset}/test.svm" + # data_source = [f'data/{dataset_}/train.svm', f'data/{dataset_}/test.svm'] + # search_space = { + # 'tfidf': { + # 'min_df': [1, 2], + # 'max_features': [10000, 320000], + # }, + # 'params': { + # 'C': [1, 2], + # 'K': [2, 100], + # }, + # } search_space = [ - {'max_features': i, 'K': j} for i in [10000] for j in [2, 100] + {'max_features': i, 'K': j, 'min_df': k, 'c': l} + for i in [10000, 20000] for j in [10, 100] for k in [1, 2] for l in [0.1, 0.2] ] - print(search_space) - n_folds = 3 - grid_search = grid.HyperparameterSearch(data_source, n_folds, search_space) - results = grid_search() - print(results) - # if num_classes != -1: - # dataset["train"]["y"] = [[yij % num_classes for yij in yi] for yi in dataset["train"]["y"]] - - # preprocessor = linear.Preprocessor() - # dataset = preprocessor.fit_transform(dataset) - # results = { - # exp_name: { - # t: 0 for t in exp_threads - # } - # for exp_name in exp_names - # } + for i in search_space: + print(i) - # for exp_name in exp_names: - # for exp_thread in tqdm(exp_threads, leave=True, colour="blue", desc=exp_name): - # if exp_name == 'Strategy B': - # do_parallel = True - # options = "-m 1" - # num_threads 
= exp_thread - # else: - # do_parallel = False - # options = f"-m {exp_thread}" - # num_threads = -1 + n_folds = 3 + grid_search = grid.HyperparameterSearch(datasets, n_folds, search_space) + results = grid_search() - # _, training_time = run_ovr(dataset, options, num_threads, do_parallel, use_dedicated_x) - # results[exp_name][exp_thread] = training_time + for i in results: + print(i) From 6e033e85b14a1e237ba3e7cb2d0e75551bdf517e Mon Sep 17 00:00:00 2001 From: chcwww Date: Thu, 8 Jan 2026 06:23:46 +0000 Subject: [PATCH 05/23] reset linear_trainer.py to master --- linear_trainer.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/linear_trainer.py b/linear_trainer.py index 49cc6401..b9133857 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -44,17 +44,7 @@ def linear_train(datasets, config): # detect task type multiclass = is_multiclass_dataset(datasets["train"], "y") - do_grid = False - if do_grid: - search_space = [ - {'max_features': i, 'K': j, 'min_df': k, 'c': l} - for i in [10000, 20000] for j in [10, 100] for k in [1, 2] for l in [0.1, 0.2] - ] - n_folds = 3 - grid_search = linear.HyperparameterSearch(datasets, n_folds, search_space) - results = grid_search() - best_params = results[0] - + # train if config.linear_technique == "tree": if multiclass: raise ValueError("Tree model should only be used with multilabel datasets.") @@ -84,7 +74,6 @@ def linear_train(datasets, config): multiclass=multiclass, options=config.liblinear_options, ) - # 2 return model From 11b6a8369111c694b1a92735ddcbf705855a7921 Mon Sep 17 00:00:00 2001 From: chcwww Date: Thu, 8 Jan 2026 07:01:29 +0000 Subject: [PATCH 06/23] merge hyperparametersearch into gridsearch --- grid.py | 190 +++++++++++++++++++++++--------------------------------- 1 file changed, 77 insertions(+), 113 deletions(-) diff --git a/grid.py b/grid.py index e8c50d65..bd54fde4 100644 --- a/grid.py +++ b/grid.py @@ -1,4 +1,3 @@ -from abc import abstractmethod from dataclasses 
import make_dataclass, field, fields, asdict from typing import Callable @@ -15,7 +14,7 @@ # suppress inevitable outputs from sparsekmeans and sklearn preprocessors -class _silent_: +class __silent__: def __init__(self): self.stderr = os.dup(2) self.devnull = os.open(os.devnull, os.O_WRONLY) @@ -51,6 +50,7 @@ class GridParameter: ('s', int, field(default=1)), ('c', float, field(default=1)), ('B', int, field(default=-1)), + ('alpha', float, field(default=1)) ] _predict_fields = [ ('beam_width', int, field(default=10)), @@ -98,17 +98,23 @@ class GridSearch: def __init__( self, datasets: dict[str, np.matrix], - n_folds: int, - search_space: list[dict], - metrics: list[str], + n_folds: int = 3, + metrics: list[str] = ["P@1", "P@3", "P@5"], ): self.datasets = datasets - self.search_space = [GridParameter(params) for params in search_space] self.n_folds = n_folds self.metrics = metrics - self.results = { - params: {metric: 0 for metric in self.metrics} for params in self.search_space - } + + self._cached_tfidf_params = None + self._cached_tfidf_data = None + self._cached_tree_params = None + self._cached_tree_roots = {fold: None for fold in range(self.n_folds)} + + self.num_instances = len(self.datasets["train"]["y"]) + + def init_tfidf_cache(self, datasets, params): + self._cached_tfidf_params = params.tfidf + self._cached_tfidf_data = datasets def sort_search_space(self): self.search_space.sort() @@ -127,56 +133,7 @@ def build_fold_idx(self): } for fold in range(self.n_folds) } - def get_fold_data(self, dataset, fold, params): - return ( - dataset["y"][self.fold_idx[fold]['train']], dataset["x"][self.fold_idx[fold]['train']], - dataset["y"][self.fold_idx[fold]['valid']], dataset["x"][self.fold_idx[fold]['valid']] - ) - - def get_cv_score(self, y, x, model, params): - logging.info(f'Scoring params: {params.predict}') - - batch_size = 256 - num_instances = x.shape[0] - num_batches = math.ceil(num_instances / batch_size) - - metrics = linear.get_metrics(self.metrics, 
num_classes=y.shape[1]) - - for i in range(num_batches): - preds = model.predict_values( - x[i * batch_size : (i + 1) * batch_size], - **asdict(params.predict)) - target = y[i * batch_size : (i + 1) * batch_size].toarray() - metrics.update(preds, target) - - scores = metrics.compute() - logging.info(f'cv_score: {scores}\n') - - return scores - - def output(self): - return sorted(self.results.items(), key=lambda x: x[1][self.metrics[0]], reverse=True) - - def __call__(self): - self.sort_search_space() - self.build_fold_idx() - - for params in self.search_space: - dataset = self.get_dataset(params) - for fold in range(self.n_folds): - y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ - self.get_fold_data(dataset, fold, params) - - model = self.get_model(y_train_fold, x_train_fold, fold, params) - cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params) - - for metric in self.metrics: - self.results[params][metric] += cv_score[metric] / self.n_folds - - return self.output() - - @abstractmethod - def get_dataset(self, params) -> dict[str, np.matrix]: + def get_dataset(self, params): """ Get the dataset for the given params. @@ -186,55 +143,43 @@ def get_dataset(self, params) -> dict[str, np.matrix]: Returns: dict[str, np.matrix]: The keys should be 'y' and 'x'. """ - pass - - @abstractmethod - def get_model(self, y, x, fold, params) -> linear.FlatModel | linear.TreeModel: - """ - Get the model for the given params. - - Args: - y (np.matrix): The labels of the training data. - x (np.matrix): The features of the training data. - params (GridParameter): The params to build the model. - - Returns: - linear.FlatModel | linear.TreeModel: The model for the given params. 
- """ - pass - - -class HyperparameterSearch(GridSearch): - def __init__(self, datasets, n_folds, search_space, metrics=["P@1", "P@3", "P@5"]): - super().__init__(datasets, n_folds, search_space, metrics) - self._cached_tfidf_params = None - self._cached_tfidf_data = None - self._cached_tree_params = None - self._cached_tree_roots = {fold: None for fold in range(self.n_folds)} - - self.num_instances = len(self.datasets["train"]["y"]) - - def get_dataset(self, params): tfidf_params = params.tfidf if tfidf_params != self._cached_tfidf_params: logging.info(f'Preprocessing tfidf: {tfidf_params}..') self._cached_tfidf_params = tfidf_params - with _silent_(): + with __silent__(): preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) self._cached_tfidf_data = preprocessor.fit_transform(self.datasets)['train'] return self._cached_tfidf_data - def get_tree_root(self, y, x, params): - with _silent_(): + def get_fold_data(self, dataset, fold): + return ( + dataset["y"][self.fold_idx[fold]['train']], dataset["x"][self.fold_idx[fold]['train']], + dataset["y"][self.fold_idx[fold]['valid']], dataset["x"][self.fold_idx[fold]['valid']] + ) + + def get_tree_root(self, y, x, tree_params): + with __silent__(): label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, **params) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params)) root.is_root = True return root def get_model(self, y, x, fold, params): + """ + Get the model for the given params. + + Args: + y (np.matrix): The labels of the training data. + x (np.matrix): The features of the training data. + params (GridParameter): The params to build the model. + + Returns: + linear.FlatModel | linear.TreeModel: The model for the given params. 
+ """ logging.info(f'\nRunning fold {fold}\nparams: {params}') tree_params = params.tree @@ -244,35 +189,54 @@ def get_model(self, y, x, fold, params): if self._cached_tree_roots[fold] is None: logging.info(f'Preprocessing tree: {tree_params} on fold {fold}..') - self._cached_tree_roots[fold] = self.get_tree_root(y, x, asdict(tree_params)) + self._cached_tree_roots[fold] = self.get_tree_root(y, x, tree_params) model = linear.train_tree(y, x, root=self._cached_tree_roots[fold], options=params.linear_options) return model + def get_cv_score(self, y, x, model, params): + logging.info(f'Scoring params: {params.predict}') -# class ProbEstimatiteSearch(GridSearch): -# def __init__(self, datasets, n_folds, search_space, config=None): -# super().__init__(datasets, n_folds, search_space, config) + batch_size = 256 + num_instances = x.shape[0] + num_batches = math.ceil(num_instances / batch_size) -# def build_data(self): -# data = {'unique': {}} -# unique_data = None # from libmultilabel preprocessing -# for i in range(self.n_folds): -# train_idx, valid_idx = None, None -# y_train_fold, x_train_fold = unique_data[train_idx] -# y_valid_fold, x_valid_fold = unique_data[valid_idx] -# data['unique'][i] = unique_data + metrics = linear.get_metrics(self.metrics, num_classes=y.shape[1]) -# return data + for i in range(num_batches): + preds = model.predict_values( + x[i * batch_size : (i + 1) * batch_size], + **asdict(params.predict)) + target = y[i * batch_size : (i + 1) * batch_size].toarray() + metrics.update(preds, target) -# def get_fold_data(self, data, i, params): -# return data['unique'][i] + scores = metrics.compute() + logging.info(f'cv_score: {scores}\n') -# def get_model(self, y_train_fold, x_train_fold, params): -# model = None # train normally with fold data -# return model + return scores + + def output(self): + return sorted(self.results.items(), key=lambda x: x[1][self.metrics[0]], reverse=True) + + def __call__(self, search_space): + self.search_space = 
[GridParameter(params) for params in search_space] + self.sort_search_space() + self.build_fold_idx() -# def get_cv_score(self, y_valid_fold, x_valid_fold, model, params): -# score = None # calculate the metric with the model and the hyperparameter A -# return score + self.results = { + params: {metric: 0 for metric in self.metrics} for params in self.search_space + } + for params in self.search_space: + dataset = self.get_dataset(params) + for fold in range(self.n_folds): + y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ + self.get_fold_data(dataset, fold, params) + + model = self.get_model(y_train_fold, x_train_fold, fold, params) + cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params) + + for metric in self.metrics: + self.results[params][metric] += cv_score[metric] / self.n_folds + + return self.output() From 51f69caa546fbb04cd80fdef0f3b40e0eed0d1b3 Mon Sep 17 00:00:00 2001 From: chcwww Date: Thu, 8 Jan 2026 07:01:59 +0000 Subject: [PATCH 07/23] update examples with pruning and prob estimate --- run_exp.py | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/run_exp.py b/run_exp.py index 84f44328..cdb6a2a3 100644 --- a/run_exp.py +++ b/run_exp.py @@ -1,10 +1,16 @@ import libmultilabel.linear as linear import grid as grid import numpy as np +from dataclasses import asdict import time import json from tqdm import tqdm +import itertools + + +def prune_model(*args, **kwargs): + pass if __name__ == "__main__": @@ -32,17 +38,49 @@ # 'K': [2, 100], # }, # } + n_folds = 3 + retrain = True + linear_technique = 'tree' + search_space_dict = { + 'max_features': [10000, 20000], + 'K': [10, 100], + 'min_df': [1, 2], + 'c': [0.1, 0.2], + } + param_names = search_space_dict.keys() search_space = [ - {'max_features': i, 'K': j, 'min_df': k, 'c': l} - for i in [10000, 20000] for j in [10, 100] for k in [1, 2] for l in [0.1, 0.2] + dict(zip(param_names, param_values)) + for 
param_values in itertools.product(*search_space_dict.values()) ] + # search_space = [dict()] # all default values + + # search_space = [ + # {'max_features': i, 'K': j, 'min_df': k, 'c': l} + # for i in [10000, 20000] for j in [10, 100] for k in [1, 2] for l in [0.1, 0.2] + # ] for i in search_space: print(i) - n_folds = 3 - grid_search = grid.HyperparameterSearch(datasets, n_folds, search_space) - results = grid_search() + search = linear.GridSearch(datasets, n_folds) + best_params = search(['hyper'])[0] - for i in results: - print(i) + if best_params.tfidf == search._cached_tfidf_params: + datasets = search._cached_tfidf_data + else: + preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) + datasets = preprocessor.fit_transform(datasets) + search.init_tfidf_cache(datasets, best_params) + + best_alpha = search(['alpha'])[0] + best_A = search(['A'])[0] + # TODO (the fields are frozen) + best_params.linear.alpha = best_alpha + best_params.linear.A = best_A + + if retrain: + model = linear.LINEAR_TECHNIQUES[linear_technique]( + datasets["train"]["y"], + datasets["train"]["x"], + **asdict(best_params.linear), + ) From b15878af70909dc47022803922e66533ddc13572 Mon Sep 17 00:00:00 2001 From: chcwww Date: Mon, 12 Jan 2026 09:30:44 +0000 Subject: [PATCH 08/23] update cache logic for search_space --- grid.py | 99 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/grid.py b/grid.py index bd54fde4..e667ec3a 100644 --- a/grid.py +++ b/grid.py @@ -50,26 +50,28 @@ class GridParameter: ('s', int, field(default=1)), ('c', float, field(default=1)), ('B', int, field(default=-1)), - ('alpha', float, field(default=1)) + ('alpha', float, field(default=1)), ] _predict_fields = [ ('beam_width', int, field(default=10)), ('A', int, field(default=1)), ] - param_types = { + _param_types = { 'tfidf': make_dataclass('TfidfParams', _tfidf_fields, frozen=True, order=True), 'tree': 
make_dataclass('TreeParams', _tree_fields, frozen=True, order=True), 'linear': make_dataclass('LinearParams', _linear_fields, frozen=True, order=True), 'predict': make_dataclass('PredictParams', _predict_fields, frozen=True, order=True), } - def __init__(self, params: dict): + def __init__(self, params: dict, fold: int = -1): self.params = params - for param_type, class_name in self.param_types.items(): + for param_type, class_name in self._param_types.items(): field_names = {f.name for f in fields(class_name)} _params = {k: v for k, v in self.params.items() if k in field_names} setattr(self, param_type, class_name(**_params)) + self.param_types = dict(self._param_types, fold=-1) + self.fold = fold @property def linear_options(self): @@ -105,10 +107,13 @@ def __init__( self.n_folds = n_folds self.metrics = metrics - self._cached_tfidf_params = None + self._cached_params = GridParameter() + for param_type in self._cached_params.param_types: + self._cached_params[param_type] = None self._cached_tfidf_data = None - self._cached_tree_params = None - self._cached_tree_roots = {fold: None for fold in range(self.n_folds)} + self._cached_tree_root = None + self._cached_fold_data = None + self._cached_model = None self.num_instances = len(self.datasets["train"]["y"]) @@ -144,31 +149,42 @@ def get_dataset(self, params): dict[str, np.matrix]: The keys should be 'y' and 'x'. 
""" tfidf_params = params.tfidf - if tfidf_params != self._cached_tfidf_params: + if tfidf_params != self._cached_params.tfidf: logging.info(f'Preprocessing tfidf: {tfidf_params}..') - self._cached_tfidf_params = tfidf_params with __silent__(): preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) + self._cached_params.tfidf = tfidf_params self._cached_tfidf_data = preprocessor.fit_transform(self.datasets)['train'] return self._cached_tfidf_data - def get_fold_data(self, dataset, fold): - return ( - dataset["y"][self.fold_idx[fold]['train']], dataset["x"][self.fold_idx[fold]['train']], - dataset["y"][self.fold_idx[fold]['valid']], dataset["x"][self.fold_idx[fold]['valid']] - ) + def get_fold_data(self, dataset, params): + fold = params.fold + if params.tfidf != self._cached_params.tfidf or fold != self._cached_params.fold: + logging.info(f'Preprocessing fold: {fold} for tfidf: {params.tfidf}..') + self._cached_params.fold = fold + self._cached_fold_data = ( + dataset["y"][self.fold_idx[fold]['train']], dataset["x"][self.fold_idx[fold]['train']], + dataset["y"][self.fold_idx[fold]['valid']], dataset["x"][self.fold_idx[fold]['valid']] + ) - def get_tree_root(self, y, x, tree_params): - with __silent__(): - label_representation = (y.T * x).tocsr() - label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params)) - root.is_root = True + return self._cached_fold_data - return root + def get_tree_root(self, y, x, params): + tree_params = params.tree + if params.tfidf != self._cached_params.tfidf or tree_params != self._cached_params.tree or \ + params.fold != self._cached_params.fold: + logging.info(f'Preprocessing tree: {tree_params} on fold {params.fold} for tfidf: {params.tfidf}..') + with __silent__(): + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, 
norm="l2", axis=1) + self._cached_params.tree = tree_params + self._cached_tree_root = _build_tree(label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params)) + self._cached_tree_root.is_root = True + + return self._cached_tree_root - def get_model(self, y, x, fold, params): + def get_model(self, y, x, params): """ Get the model for the given params. @@ -180,20 +196,17 @@ def get_model(self, y, x, fold, params): Returns: linear.FlatModel | linear.TreeModel: The model for the given params. """ - logging.info(f'\nRunning fold {fold}\nparams: {params}') - - tree_params = params.tree - if tree_params != self._cached_tree_params: - self._cached_tree_params = tree_params - self._cached_tree_roots = {fold: None for fold in range(self.n_folds)} - - if self._cached_tree_roots[fold] is None: - logging.info(f'Preprocessing tree: {tree_params} on fold {fold}..') - self._cached_tree_roots[fold] = self.get_tree_root(y, x, tree_params) + logging.info(f'\nRunning fold {params.fold}\nparams: {params}') - model = linear.train_tree(y, x, root=self._cached_tree_roots[fold], options=params.linear_options) + linear_params = params.linear + if params.tfidf != self._cached_params.tfidf or params.tree != self._cached_params.tree or \ + linear_params != self._cached_params.linear or params.fold != self._cached_params.fold: + logging.info(f'Preprocessing linear: {linear_params}, tree: {params.tree} on fold {params.fold} for tfidf: {params.tfidf}..') + root = self.get_tree_root(y, x, params) + self._cached_params.linear = linear_params + self._cached_model = linear.train_tree(y, x, root=root, options=params.linear_options) - return model + return self._cached_model def get_cv_score(self, y, x, model, params): logging.info(f'Scoring params: {params.predict}') @@ -220,23 +233,23 @@ def output(self): return sorted(self.results.items(), key=lambda x: x[1][self.metrics[0]], reverse=True) def __call__(self, search_space): - self.search_space = [GridParameter(params) for params in 
search_space] + self.search_space = [GridParameter(params, fold) for params in search_space for fold in range(self.n_folds)] self.sort_search_space() self.build_fold_idx() self.results = { - params: {metric: 0 for metric in self.metrics} for params in self.search_space + GridParameter(params): {metric: 0 for metric in self.metrics} for params in search_space } for params in self.search_space: dataset = self.get_dataset(params) - for fold in range(self.n_folds): - y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ - self.get_fold_data(dataset, fold, params) + y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ + self.get_fold_data(dataset, params) - model = self.get_model(y_train_fold, x_train_fold, fold, params) - cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params) + model = self.get_model(y_train_fold, x_train_fold, params) + cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params) - for metric in self.metrics: - self.results[params][metric] += cv_score[metric] / self.n_folds + params.fold = -1 + for metric in self.metrics: + self.results[params][metric] += cv_score[metric] / self.n_folds return self.output() From 7300a5fdd16fc6438c8c62674006cf94dfa82e5a Mon Sep 17 00:00:00 2001 From: chcwww Date: Mon, 19 Jan 2026 18:07:44 +0000 Subject: [PATCH 09/23] update better sorting logic --- grid.py | 28 ++++++++++++++--------- libmultilabel/linear/__init__.py | 2 +- run_exp.py | 38 ++++++++++++++++++-------------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/grid.py b/grid.py index e667ec3a..c4b89f50 100644 --- a/grid.py +++ b/grid.py @@ -50,28 +50,34 @@ class GridParameter: ('s', int, field(default=1)), ('c', float, field(default=1)), ('B', int, field(default=-1)), - ('alpha', float, field(default=1)), + # ('alpha', float, field(default=1)), ] _predict_fields = [ ('beam_width', int, field(default=10)), ('A', int, field(default=1)), ] - _param_types = { + param_types = { 'tfidf': 
make_dataclass('TfidfParams', _tfidf_fields, frozen=True, order=True), + 'fold': lambda fold: fold, 'tree': make_dataclass('TreeParams', _tree_fields, frozen=True, order=True), 'linear': make_dataclass('LinearParams', _linear_fields, frozen=True, order=True), 'predict': make_dataclass('PredictParams', _predict_fields, frozen=True, order=True), } - def __init__(self, params: dict, fold: int = -1): - self.params = params - for param_type, class_name in self._param_types.items(): - field_names = {f.name for f in fields(class_name)} - _params = {k: v for k, v in self.params.items() if k in field_names} - setattr(self, param_type, class_name(**_params)) - self.param_types = dict(self._param_types, fold=-1) - self.fold = fold + def __init__(self, params: dict | None = None, fold: int = -1): + self.params = params or {} + + params_set = set(self.params) + for param_type, class_name in self.param_types.items(): + if param_type == 'fold': + filtered_params = {'fold': fold} + else: + field_names = {f.name for f in fields(class_name)} + filtered_keys = params_set & field_names + params_set -= field_names + filtered_params = {k: self.params[k] for k in filtered_keys} + setattr(self, param_type, class_name(**filtered_params)) @property def linear_options(self): @@ -109,7 +115,7 @@ def __init__( self._cached_params = GridParameter() for param_type in self._cached_params.param_types: - self._cached_params[param_type] = None + setattr(self._cached_params, param_type, None) self._cached_tfidf_data = None self._cached_tree_root = None self._cached_fold_data = None diff --git a/libmultilabel/linear/__init__.py b/libmultilabel/linear/__init__.py index efe24120..7cdf30bb 100644 --- a/libmultilabel/linear/__init__.py +++ b/libmultilabel/linear/__init__.py @@ -3,4 +3,4 @@ from .metrics import * from .preprocessor import * from .tree import * -from .utils import * \ No newline at end of file +from .utils import * diff --git a/run_exp.py b/run_exp.py index cdb6a2a3..b359b8a7 100644 --- 
a/run_exp.py +++ b/run_exp.py @@ -1,5 +1,6 @@ import libmultilabel.linear as linear -import grid as grid +import grid + import numpy as np from dataclasses import asdict @@ -39,12 +40,13 @@ def prune_model(*args, **kwargs): # }, # } n_folds = 3 - retrain = True + retrain = False linear_technique = 'tree' search_space_dict = { 'max_features': [10000, 20000], 'K': [10, 100], 'min_df': [1, 2], + 'A': [2, 3], 'c': [0.1, 0.2], } param_names = search_space_dict.keys() @@ -62,21 +64,23 @@ def prune_model(*args, **kwargs): for i in search_space: print(i) - search = linear.GridSearch(datasets, n_folds) - best_params = search(['hyper'])[0] - - if best_params.tfidf == search._cached_tfidf_params: - datasets = search._cached_tfidf_data - else: - preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) - datasets = preprocessor.fit_transform(datasets) - search.init_tfidf_cache(datasets, best_params) - - best_alpha = search(['alpha'])[0] - best_A = search(['A'])[0] - # TODO (the fields are frozen) - best_params.linear.alpha = best_alpha - best_params.linear.A = best_A + search = grid.GridSearch(datasets, n_folds) + best_params = search(search_space) + print(best_params) + breakpoint() + + # if best_params.tfidf == search._cached_tfidf_params: + # datasets = search._cached_tfidf_data + # else: + # preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) + # datasets = preprocessor.fit_transform(datasets) + # search.init_tfidf_cache(datasets, best_params) + + # best_alpha = search(['alpha'])[0] + # best_A = search(['A'])[0] + # # TODO (the fields are frozen) + # best_params.linear.alpha = best_alpha + # best_params.linear.A = best_A if retrain: model = linear.LINEAR_TECHNIQUES[linear_technique]( From 9412d8cc8cfc95114d7a812894708928b2867279 Mon Sep 17 00:00:00 2001 From: chcwww Date: Mon, 19 Jan 2026 18:08:07 +0000 Subject: [PATCH 10/23] update the parameter A --- libmultilabel/linear/linear.py | 3 +-- libmultilabel/linear/tree.py | 22 
+++++++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py index d70620bc..04d25a21 100644 --- a/libmultilabel/linear/linear.py +++ b/libmultilabel/linear/linear.py @@ -44,7 +44,7 @@ def __init__( self.thresholds = thresholds self.multiclass = multiclass - def predict_values(self, x: sparse.csr_matrix, *args, **kwargs) -> np.ndarray: + def predict_values(self, x: sparse.csr_matrix) -> np.ndarray: """Calculate the decision values associated with x. Args: @@ -198,7 +198,6 @@ def train_1vsrest( multiclass: bool = False, options: str = "", verbose: bool = True, - *args, **kwargs, ) -> FlatModel: """Train a linear model parallel on labels for multi-label data using a one-vs-rest strategy. diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 288e3061..96681cbd 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -4,6 +4,7 @@ import numpy as np import scipy.sparse as sparse +from scipy.special import log_expit from sparsekmeans import LloydKmeans, ElkanKmeans import sklearn.preprocessing from tqdm import tqdm @@ -62,7 +63,7 @@ def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, - *args, **kwargs, + A: int = 3, ) -> np.ndarray: """Calculate the probability estimates associated with x. @@ -73,6 +74,7 @@ def predict_values( Returns: np.ndarray: A matrix with dimension number of instances * number of classes. """ + sigmoid_A = lambda x: log_expit(A * x) if beam_width >= len(self.root.children): # Beam_width is sufficiently large; pruning not applied. # Calculates decision values for all nodes. 
@@ -82,8 +84,8 @@ def predict_values( if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels) - return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])]) + all_preds = self._prune_tree_and_predict_values(x, beam_width, sigmoid_A) # number of instances * (number of labels + total number of metalabels) + return np.vstack([self._beam_search(all_preds[i], beam_width, sigmoid_A) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): """ @@ -114,7 +116,7 @@ def _separate_model_for_pruning_tree(self): ) self.subtree_models.append(subtree_flatmodel) - def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, sigmoid_A: Callable) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. 
Only subtrees corresponding to the top beam_width candidates from the root are evaluated, @@ -133,7 +135,8 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) + # children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) + children_scores = 0.0 + self.sigmoid_A(root_preds) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -160,7 +163,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) return all_preds - def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray: + def _beam_search(self, instance_preds: np.ndarray, beam_width: int, sigmoid_A: Callable) -> np.ndarray: """Predict with beam search using cached probability estimates for a single instance. Args: @@ -183,7 +186,8 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - np.square(np.maximum(0, 1 - pred)) + # children_score = score - np.square(np.maximum(0, 1 - pred)) + children_score = score + self.sigmoid_A(pred) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -194,7 +198,8 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) + # scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) + scores[node.label_map] = np.exp(score + self.sigmoid_A(pred)) return scores @@ -206,7 +211,6 
@@ def train_tree( dmax=DEFAULT_DMAX, verbose: bool = True, root: Node = None, - *args, **kwargs, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. The algorithm used is based on https://github.com/xmc-aalto/bonsai. From ba3d1a3ce5b757b79dcc801c397a1f71fad04e36 Mon Sep 17 00:00:00 2001 From: chcwww Date: Mon, 19 Jan 2026 18:11:18 +0000 Subject: [PATCH 11/23] fix bug for the parameter A --- libmultilabel/linear/tree.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 96681cbd..87e7242e 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -136,7 +136,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) # children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) - children_scores = 0.0 + self.sigmoid_A(root_preds) + children_scores = 0.0 + sigmoid_A(root_preds) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -187,7 +187,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int, sigmoid_A: C slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] # children_score = score - np.square(np.maximum(0, 1 - pred)) - children_score = score + self.sigmoid_A(pred) + children_score = score + sigmoid_A(pred) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -199,7 +199,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int, sigmoid_A: C slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] # scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) - scores[node.label_map] = np.exp(score + 
self.sigmoid_A(pred)) + scores[node.label_map] = np.exp(score + sigmoid_A(pred)) return scores From 7c3abb8c3c87068ae73d4c2e1846fa46eb139a9d Mon Sep 17 00:00:00 2001 From: chcwww Date: Thu, 22 Jan 2026 15:48:24 +0000 Subject: [PATCH 12/23] use self.no_cache to control the cache --- grid.py | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/grid.py b/grid.py index c4b89f50..937bb68c 100644 --- a/grid.py +++ b/grid.py @@ -120,12 +120,13 @@ def __init__( self._cached_tree_root = None self._cached_fold_data = None self._cached_model = None + self.no_cache = True self.num_instances = len(self.datasets["train"]["y"]) - def init_tfidf_cache(self, datasets, params): - self._cached_tfidf_params = params.tfidf - self._cached_tfidf_data = datasets + # def init_tfidf_cache(self, datasets, params): + # self._cached_tfidf_params = params.tfidf + # self._cached_tfidf_data = datasets def sort_search_space(self): self.search_space.sort() @@ -155,7 +156,8 @@ def get_dataset(self, params): dict[str, np.matrix]: The keys should be 'y' and 'x'. 
""" tfidf_params = params.tfidf - if tfidf_params != self._cached_params.tfidf: + self.no_cache = (tfidf_params != self._cached_params.tfidf) + if self.no_cache: logging.info(f'Preprocessing tfidf: {tfidf_params}..') with __silent__(): preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) @@ -166,7 +168,8 @@ def get_dataset(self, params): def get_fold_data(self, dataset, params): fold = params.fold - if params.tfidf != self._cached_params.tfidf or fold != self._cached_params.fold: + self.no_cache |= (fold != self._cached_params.fold) + if self.no_cache: logging.info(f'Preprocessing fold: {fold} for tfidf: {params.tfidf}..') self._cached_params.fold = fold self._cached_fold_data = ( @@ -178,8 +181,8 @@ def get_fold_data(self, dataset, params): def get_tree_root(self, y, x, params): tree_params = params.tree - if params.tfidf != self._cached_params.tfidf or tree_params != self._cached_params.tree or \ - params.fold != self._cached_params.fold: + self.no_cache |= (tree_params != self._cached_params.tree) + if self.no_cache: logging.info(f'Preprocessing tree: {tree_params} on fold {params.fold} for tfidf: {params.tfidf}..') with __silent__(): label_representation = (y.T * x).tocsr() @@ -204,24 +207,26 @@ def get_model(self, y, x, params): """ logging.info(f'\nRunning fold {params.fold}\nparams: {params}') + root = self.get_tree_root(y, x, params) + linear_params = params.linear - if params.tfidf != self._cached_params.tfidf or params.tree != self._cached_params.tree or \ - linear_params != self._cached_params.linear or params.fold != self._cached_params.fold: + self.no_cache |= (linear_params != self._cached_params.linear) + if self.no_cache: logging.info(f'Preprocessing linear: {linear_params}, tree: {params.tree} on fold {params.fold} for tfidf: {params.tfidf}..') - root = self.get_tree_root(y, x, params) - self._cached_params.linear = linear_params - self._cached_model = linear.train_tree(y, x, root=root, options=params.linear_options) + with 
__silent__(): + self._cached_params.linear = linear_params + self._cached_model = linear.train_tree(y, x, root=root, options=params.linear_options) return self._cached_model - def get_cv_score(self, y, x, model, params): + def get_cv_score(self, y, x, model, params, metrics): logging.info(f'Scoring params: {params.predict}') batch_size = 256 num_instances = x.shape[0] num_batches = math.ceil(num_instances / batch_size) - metrics = linear.get_metrics(self.metrics, num_classes=y.shape[1]) + # metrics = linear.get_metrics(self.metrics, num_classes=y.shape[1]) for i in range(num_batches): preds = model.predict_values( @@ -230,6 +235,7 @@ def get_cv_score(self, y, x, model, params): target = y[i * batch_size : (i + 1) * batch_size].toarray() metrics.update(preds, target) + # return metrics scores = metrics.compute() logging.info(f'cv_score: {scores}\n') @@ -244,18 +250,20 @@ def __call__(self, search_space): self.build_fold_idx() self.results = { - GridParameter(params): {metric: 0 for metric in self.metrics} for params in search_space + GridParameter(params): linear.get_metrics(self.metrics, num_classes=y.shape[1]) for params in search_space } for params in self.search_space: + # for fold in self.n_folds: dataset = self.get_dataset(params) y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ self.get_fold_data(dataset, params) model = self.get_model(y_train_fold, x_train_fold, params) - cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params) - params.fold = -1 - for metric in self.metrics: - self.results[params][metric] += cv_score[metric] / self.n_folds + cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params, self.results[params]) + + # params.fold = -1 + # for metric in self.metrics: + # # self.results[params][metric] += cv_score[metric] / self.n_folds return self.output() From f4a44d8e08fb834b8acc474dd751f2e81e25e317 Mon Sep 17 00:00:00 2001 From: chcwww Date: Thu, 22 Jan 2026 17:54:59 +0000 Subject: [PATCH 13/23] rewrite fold 
and tfidf for correctness --- grid.py | 207 ++++++++++++++++++++++++++--------------------------- run_exp.py | 55 +++----------- 2 files changed, 113 insertions(+), 149 deletions(-) diff --git a/grid.py b/grid.py index 937bb68c..c25d7940 100644 --- a/grid.py +++ b/grid.py @@ -3,6 +3,7 @@ import os import sys +import itertools import logging import libmultilabel.linear as linear @@ -22,7 +23,7 @@ def __init__(self): def __enter__(self): os.dup2(self.devnull, 2) self.stdout = sys.stdout - sys.stdout = open(os.devnull, 'w') + sys.stdout = open(os.devnull, "w") def __exit__(self, type, value, traceback): os.dup2(self.stderr, 2) @@ -35,34 +36,37 @@ def __exit__(self, type, value, traceback): class GridParameter: _tfidf_fields = [ - ('ngram_range', tuple[int, int], field(default=(1, 1))), - ('max_features', int, field(default=None)), - ('min_df', float | int, field(default=1)), - ('stop_words', str | list, field(default=None)), - ('strip_accents', str | Callable, field(default=None)), - ('tokenizer', Callable, field(default=None)), + ("ngram_range", tuple[int, int], field(default=(1, 1))), + ("max_features", int, field(default=None)), + ("min_df", float | int, field(default=1)), + ("stop_words", str | list, field(default=None)), + ("strip_accents", str | Callable, field(default=None)), + ("tokenizer", Callable, field(default=None)), ] _tree_fields = [ - ('dmax', int, field(default=10)), - ('K', int, field(default=8)), + ("dmax", int, field(default=10)), + ("K", int, field(default=8)), ] _linear_fields = [ - ('s', int, field(default=1)), - ('c', float, field(default=1)), - ('B', int, field(default=-1)), - # ('alpha', float, field(default=1)), + ("s", int, field(default=1)), + ("c", float, field(default=1)), + ("B", int, field(default=-1)), + # ("alpha", float, field(default=1)), ] _predict_fields = [ - ('beam_width', int, field(default=10)), - ('A', int, field(default=1)), + ("beam_width", int, field(default=10)), + ("A", int, field(default=1)), ] param_types = { - 
'tfidf': make_dataclass('TfidfParams', _tfidf_fields, frozen=True, order=True), - 'fold': lambda fold: fold, - 'tree': make_dataclass('TreeParams', _tree_fields, frozen=True, order=True), - 'linear': make_dataclass('LinearParams', _linear_fields, frozen=True, order=True), - 'predict': make_dataclass('PredictParams', _predict_fields, frozen=True, order=True), + "tfidf": make_dataclass("TfidfParams", _tfidf_fields, frozen=True, order=True), + "tree": make_dataclass("TreeParams", _tree_fields, frozen=True, order=True), + "linear": make_dataclass("LinearParams", _linear_fields, frozen=True, order=True), + "predict": make_dataclass("PredictParams", _predict_fields, frozen=True, order=True), + } + _param_field_names = { + param_type: {f.name for f in fields(class_name)} + for param_type, class_name in param_types.items() } def __init__(self, params: dict | None = None, fold: int = -1): @@ -70,18 +74,16 @@ def __init__(self, params: dict | None = None, fold: int = -1): params_set = set(self.params) for param_type, class_name in self.param_types.items(): - if param_type == 'fold': - filtered_params = {'fold': fold} - else: - field_names = {f.name for f in fields(class_name)} - filtered_keys = params_set & field_names - params_set -= field_names - filtered_params = {k: self.params[k] for k in filtered_keys} + field_names = self._param_field_names[param_type] + filtered_keys = params_set & field_names + params_set -= field_names + + filtered_params = {k: self.params[k] for k in filtered_keys} setattr(self, param_type, class_name(**filtered_params)) @property def linear_options(self): - options = '' + options = "" for f in fields(self.linear): options += f" -{f.name} {getattr(self.linear, f.name)}" return options.strip() @@ -107,16 +109,17 @@ def __init__( self, datasets: dict[str, np.matrix], n_folds: int = 3, - metrics: list[str] = ["P@1", "P@3", "P@5"], + monitor_metrics: list[str] = ["P@1", "P@3", "P@5"], ): self.datasets = datasets self.n_folds = n_folds - self.metrics = 
metrics + self.monitor_metrics = monitor_metrics + self.param_metrics = dict() self._cached_params = GridParameter() for param_type in self._cached_params.param_types: setattr(self._cached_params, param_type, None) - self._cached_tfidf_data = None + self._cached_transformed_dataset = None self._cached_tree_root = None self._cached_fold_data = None self._cached_model = None @@ -124,66 +127,53 @@ def __init__( self.num_instances = len(self.datasets["train"]["y"]) - # def init_tfidf_cache(self, datasets, params): - # self._cached_tfidf_params = params.tfidf - # self._cached_tfidf_data = datasets - - def sort_search_space(self): - self.search_space.sort() - - def build_fold_idx(self): - permutation = np.random.permutation(self.num_instances) - index_per_fold = [ - permutation[int(fold * self.num_instances / self.n_folds):int((fold+1) * self.num_instances / self.n_folds)] - for fold in range(self.n_folds) - ] - - self.fold_idx = { - fold: { - 'train': np.concatenate(index_per_fold[:fold] + index_per_fold[fold+1:]), - 'valid': index_per_fold[fold] - } for fold in range(self.n_folds) + def get_fold_dataset(self, train_idx, valid_idx): + def take(data, idx): + if isinstance(data, list): + return [data[i] for i in idx] + else: + return data[idx] + + return { + "data_format": self.datasets["data_format"], + "train": { + "y": take(self.datasets["train"]["y"], train_idx), + "x": take(self.datasets["train"]["x"], train_idx) + }, + "test": { + "y": take(self.datasets["train"]["y"], valid_idx), + "x": take(self.datasets["train"]["x"], valid_idx) } + } - def get_dataset(self, params): + def get_transformed_dataset(self, dataset, params): """ - Get the dataset for the given params. + Get the dataset for the given tf-idf params. Args: params (GridParameter): The params to build the dataset. Returns: - dict[str, np.matrix]: The keys should be 'y' and 'x'. + dict[str, np.matrix]: The keys should be "y" and "x". 
""" tfidf_params = params.tfidf self.no_cache = (tfidf_params != self._cached_params.tfidf) if self.no_cache: - logging.info(f'Preprocessing tfidf: {tfidf_params}..') + logging.info(f"Preprocessing tfidf: {tfidf_params}") + if self.datasets["data_format"] not in {"txt", "dataframe"}: + logging.info('The TF-IDF parameters are only meaningful for the “txt” and “dataframe” data formats.') with __silent__(): preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) self._cached_params.tfidf = tfidf_params - self._cached_tfidf_data = preprocessor.fit_transform(self.datasets)['train'] + self._cached_transformed_dataset = preprocessor.fit_transform(dataset) - return self._cached_tfidf_data - - def get_fold_data(self, dataset, params): - fold = params.fold - self.no_cache |= (fold != self._cached_params.fold) - if self.no_cache: - logging.info(f'Preprocessing fold: {fold} for tfidf: {params.tfidf}..') - self._cached_params.fold = fold - self._cached_fold_data = ( - dataset["y"][self.fold_idx[fold]['train']], dataset["x"][self.fold_idx[fold]['train']], - dataset["y"][self.fold_idx[fold]['valid']], dataset["x"][self.fold_idx[fold]['valid']] - ) - - return self._cached_fold_data + return self._cached_transformed_dataset def get_tree_root(self, y, x, params): tree_params = params.tree self.no_cache |= (tree_params != self._cached_params.tree) if self.no_cache: - logging.info(f'Preprocessing tree: {tree_params} on fold {params.fold} for tfidf: {params.tfidf}..') + logging.info(f"Preprocessing tree: {tree_params}") with __silent__(): label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) @@ -205,65 +195,72 @@ def get_model(self, y, x, params): Returns: linear.FlatModel | linear.TreeModel: The model for the given params. 
""" - logging.info(f'\nRunning fold {params.fold}\nparams: {params}') - root = self.get_tree_root(y, x, params) linear_params = params.linear self.no_cache |= (linear_params != self._cached_params.linear) if self.no_cache: - logging.info(f'Preprocessing linear: {linear_params}, tree: {params.tree} on fold {params.fold} for tfidf: {params.tfidf}..') + logging.info(f"Training: {linear_params}") with __silent__(): self._cached_params.linear = linear_params self._cached_model = linear.train_tree(y, x, root=root, options=params.linear_options) return self._cached_model - def get_cv_score(self, y, x, model, params, metrics): - logging.info(f'Scoring params: {params.predict}') + def compute_scores(self, y, x, model, params): + logging.info(f"Scoring: {params.predict}") batch_size = 256 num_instances = x.shape[0] num_batches = math.ceil(num_instances / batch_size) - # metrics = linear.get_metrics(self.metrics, num_classes=y.shape[1]) + if params not in self.param_metrics.keys(): + self.param_metrics[params] = linear.get_metrics(self.monitor_metrics, num_classes=y.shape[1]) for i in range(num_batches): preds = model.predict_values( x[i * batch_size : (i + 1) * batch_size], **asdict(params.predict)) target = y[i * batch_size : (i + 1) * batch_size].toarray() - metrics.update(preds, target) - - # return metrics - scores = metrics.compute() - logging.info(f'cv_score: {scores}\n') - - return scores - - def output(self): - return sorted(self.results.items(), key=lambda x: x[1][self.metrics[0]], reverse=True) + self.param_metrics[params].update(preds, target) - def __call__(self, search_space): - self.search_space = [GridParameter(params, fold) for params in search_space for fold in range(self.n_folds)] - self.sort_search_space() - self.build_fold_idx() + logging.info(f"cv_score: {self.param_metrics[params].compute()}\n") - self.results = { - GridParameter(params): linear.get_metrics(self.metrics, num_classes=y.shape[1]) for params in search_space - } - for params in 
self.search_space: - # for fold in self.n_folds: - dataset = self.get_dataset(params) - y_train_fold, x_train_fold, y_valid_fold, x_valid_fold = \ - self.get_fold_data(dataset, params) + def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, dict[str, float]]: + self.param_metrics.clear() - model = self.get_model(y_train_fold, x_train_fold, params) - params.fold = -1 - cv_score = self.get_cv_score(y_valid_fold, x_valid_fold, model, params, self.results[params]) + param_names = search_space_dict.keys() + self.search_space = sorted([ + GridParameter(dict(zip(param_names, param_values))) + for param_values in itertools.product(*search_space_dict.values()) + ]) - # params.fold = -1 - # for metric in self.metrics: - # # self.results[params][metric] += cv_score[metric] / self.n_folds + permutation = np.random.permutation(self.num_instances) + index_per_fold = [ + permutation[int(fold * self.num_instances / self.n_folds):int((fold+1) * self.num_instances / self.n_folds)] + for fold in range(self.n_folds) + ] - return self.output() + for fold in range(self.n_folds): + train_idx = np.concatenate(index_per_fold[:fold] + index_per_fold[fold+1:]) + valid_idx = index_per_fold[fold] + fold_dataset = self.get_fold_dataset(train_idx, valid_idx) + + self._cached_params.tfidf = None + for params in self.search_space: + logging.info(f"Running fold {fold}, params: {params}") + + transformed_dataset = self.get_transformed_dataset(fold_dataset, params) + model = self.get_model( + transformed_dataset["train"]["y"], + transformed_dataset["train"]["x"], + params + ) + self.compute_scores( + transformed_dataset["test"]["y"], + transformed_dataset["test"]["x"], + model, + params + ) + + return {params: metrics.compute() for params, metrics in self.param_metrics.items()} diff --git a/run_exp.py b/run_exp.py index b359b8a7..88d376bf 100644 --- a/run_exp.py +++ b/run_exp.py @@ -23,22 +23,11 @@ def prune_model(*args, **kwargs): parser = 
argparse.ArgumentParser(description="Parse command-line arguments.") parser.add_argument("--dataset", type=str, default="EUR-Lex", help="Dataset name (e.g., AmazonCat-13K, EUR-Lex)") + parser.add_argument("--data_format", type=str, default="txt", help="Data format.") args = parser.parse_args() - dataset_ = args.dataset + dataset = linear.load_dataset(args.data_format, f"data/{args.dataset}/train.{args.data_format}") # , f"data/{dataset}/test.{args.data_format}" - datasets = linear.load_dataset("svm", f"data/{dataset_}/train.svm") # , f"data/{dataset}/test.svm" - # data_source = [f'data/{dataset_}/train.svm', f'data/{dataset_}/test.svm'] - # search_space = { - # 'tfidf': { - # 'min_df': [1, 2], - # 'max_features': [10000, 320000], - # }, - # 'params': { - # 'C': [1, 2], - # 'K': [2, 100], - # }, - # } n_folds = 3 retrain = False linear_technique = 'tree' @@ -49,42 +38,20 @@ def prune_model(*args, **kwargs): 'A': [2, 3], 'c': [0.1, 0.2], } - param_names = search_space_dict.keys() - search_space = [ - dict(zip(param_names, param_values)) - for param_values in itertools.product(*search_space_dict.values()) - ] - # search_space = [dict()] # all default values - # search_space = [ - # {'max_features': i, 'K': j, 'min_df': k, 'c': l} - # for i in [10000, 20000] for j in [10, 100] for k in [1, 2] for l in [0.1, 0.2] - # ] + # for i in search_space: + # print(i) - for i in search_space: - print(i) - - search = grid.GridSearch(datasets, n_folds) - best_params = search(search_space) - print(best_params) + search = grid.GridSearch(dataset, n_folds) + scores = search(search_space_dict) + print(scores) breakpoint() - # if best_params.tfidf == search._cached_tfidf_params: - # datasets = search._cached_tfidf_data - # else: - # preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) - # datasets = preprocessor.fit_transform(datasets) - # search.init_tfidf_cache(datasets, best_params) - - # best_alpha = search(['alpha'])[0] - # best_A = search(['A'])[0] - # # TODO 
(the fields are frozen) - # best_params.linear.alpha = best_alpha - # best_params.linear.A = best_A - if retrain: + # TODO + best_params = None model = linear.LINEAR_TECHNIQUES[linear_technique]( - datasets["train"]["y"], - datasets["train"]["x"], + dataset["train"]["y"], + dataset["train"]["x"], **asdict(best_params.linear), ) From 4942b805f749546aa62aa95d755818df4baf494f Mon Sep 17 00:00:00 2001 From: chcwww Date: Fri, 23 Jan 2026 06:06:17 +0000 Subject: [PATCH 14/23] update retrain example --- run_exp.py | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/run_exp.py b/run_exp.py index 88d376bf..b38fccf9 100644 --- a/run_exp.py +++ b/run_exp.py @@ -4,11 +4,6 @@ import numpy as np from dataclasses import asdict -import time -import json -from tqdm import tqdm -import itertools - def prune_model(*args, **kwargs): pass @@ -19,7 +14,7 @@ def prune_model(*args, **kwargs): import logging logging.basicConfig(level=logging.INFO) - np.random.seed(20250820) + np.random.seed(20260123) parser = argparse.ArgumentParser(description="Parse command-line arguments.") parser.add_argument("--dataset", type=str, default="EUR-Lex", help="Dataset name (e.g., AmazonCat-13K, EUR-Lex)") @@ -28,9 +23,9 @@ def prune_model(*args, **kwargs): dataset = linear.load_dataset(args.data_format, f"data/{args.dataset}/train.{args.data_format}") # , f"data/{dataset}/test.{args.data_format}" + retrain = True n_folds = 3 - retrain = False - linear_technique = 'tree' + monitor_metrics = ["P@1", "P@3", "P@5"] search_space_dict = { 'max_features': [10000, 20000], 'K': [10, 100], @@ -39,19 +34,21 @@ def prune_model(*args, **kwargs): 'c': [0.1, 0.2], } - # for i in search_space: - # print(i) - - search = grid.GridSearch(dataset, n_folds) - scores = search(search_space_dict) - print(scores) - breakpoint() + search = grid.GridSearch(dataset, n_folds, monitor_metrics) + cv_scores = search(search_space_dict) + sorted_cv_scores = sorted(cv_scores.items(), 
key=lambda x: x[1][monitor_metrics[0]], reverse=True) + print(sorted_cv_scores) if retrain: - # TODO - best_params = None - model = linear.LINEAR_TECHNIQUES[linear_technique]( - dataset["train"]["y"], - dataset["train"]["x"], - **asdict(best_params.linear), + # TODO: test set + best_params, best_cv_scores = list(sorted_cv_scores)[0] + print(best_params, best_cv_scores) + + preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) + transformed_dataset = preprocessor.fit_transform(dataset) + model = linear.train_tree( + transformed_dataset["train"]["y"], + transformed_dataset["train"]["x"], + best_params.linear_options, + **asdict(best_params.tree), ) From 5815e3ec680ca48249b1be8b61b4de644d7f555a Mon Sep 17 00:00:00 2001 From: chcwww Date: Fri, 23 Jan 2026 06:07:21 +0000 Subject: [PATCH 15/23] make the logging info prettier --- grid.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/grid.py b/grid.py index c25d7940..5593644e 100644 --- a/grid.py +++ b/grid.py @@ -159,13 +159,15 @@ def get_transformed_dataset(self, dataset, params): tfidf_params = params.tfidf self.no_cache = (tfidf_params != self._cached_params.tfidf) if self.no_cache: - logging.info(f"Preprocessing tfidf: {tfidf_params}") + logging.info(f"TFIDF - Preprocessing: {tfidf_params}") if self.datasets["data_format"] not in {"txt", "dataframe"}: logging.info('The TF-IDF parameters are only meaningful for the “txt” and “dataframe” data formats.') with __silent__(): preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) self._cached_params.tfidf = tfidf_params self._cached_transformed_dataset = preprocessor.fit_transform(dataset) + else: + logging.info(f"TFIDF - Using cached data: {tfidf_params}") return self._cached_transformed_dataset @@ -173,13 +175,15 @@ def get_tree_root(self, y, x, params): tree_params = params.tree self.no_cache |= (tree_params != self._cached_params.tree) if self.no_cache: - logging.info(f"Preprocessing tree: 
{tree_params}") + logging.info(f"Tree - Preprocessing: {tree_params}") with __silent__(): label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) self._cached_params.tree = tree_params self._cached_tree_root = _build_tree(label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params)) self._cached_tree_root.is_root = True + else: + logging.info(f"Tree - Using cached data: {tree_params}") return self._cached_tree_root @@ -200,15 +204,17 @@ def get_model(self, y, x, params): linear_params = params.linear self.no_cache |= (linear_params != self._cached_params.linear) if self.no_cache: - logging.info(f"Training: {linear_params}") + logging.info(f"Model - Training: {linear_params}") with __silent__(): self._cached_params.linear = linear_params self._cached_model = linear.train_tree(y, x, root=root, options=params.linear_options) + else: + logging.info(f"Model - Using cached data: {linear_params}") return self._cached_model def compute_scores(self, y, x, model, params): - logging.info(f"Scoring: {params.predict}") + logging.info(f"Metric - Scoring: {params.predict}\n") batch_size = 256 num_instances = x.shape[0] @@ -224,8 +230,6 @@ def compute_scores(self, y, x, model, params): target = y[i * batch_size : (i + 1) * batch_size].toarray() self.param_metrics[params].update(preds, target) - logging.info(f"cv_score: {self.param_metrics[params].compute()}\n") - def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, dict[str, float]]: self.param_metrics.clear() @@ -248,7 +252,7 @@ def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, di self._cached_params.tfidf = None for params in self.search_space: - logging.info(f"Running fold {fold}, params: {params}") + logging.info(f"Status - Running fold {fold}, params: {params}") transformed_dataset = self.get_transformed_dataset(fold_dataset, params) model = self.get_model( From 
1e284cecd77e90eff7cab1cd4cb1f5bd808a013a Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 27 Jan 2026 16:44:40 +0000 Subject: [PATCH 16/23] update weights pruning - the implementation is based on code from Zhi-Bao's repo. --- libmultilabel/linear/linear.py | 41 ++++++++++++++++++++++++++++++++-- libmultilabel/linear/tree.py | 8 +++++-- linear_trainer.py | 1 + main.py | 6 +++++ 4 files changed, 52 insertions(+), 4 deletions(-) diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py index 04d25a21..2e7d6611 100644 --- a/libmultilabel/linear/linear.py +++ b/libmultilabel/linear/linear.py @@ -92,6 +92,37 @@ def _to_dense_array(self, matrix: np.matrix | sparse.csr_matrix) -> np.ndarray: return np.asarray(matrix) +def _pruning_weights(weights: np.ndarray, pruning_alpha: float) -> np.ndarray: + """Prune the weights of the linear model. + + Args: + weights (np.ndarray): Linear model weights. + pruning_alpha (float): Fraction of weights to keep after pruning. + + Returns: + np.ndarray: The pruned weights. + """ + pruning_ratio = 1-pruning_alpha + + if 0 >= pruning_ratio: + return weights + elif pruning_ratio >= 1: + return np.zeros_like(weights) + else: + # Perform pruning algorithm + # Reduce the number of nonzero features per column by a factor of pruning_ratio. 
+ nonzero_indices = np.flatnonzero(weights) + num_nonzeros = nonzero_indices.size + # Threshold + k = np.clip(int(pruning_ratio * num_nonzeros), 0, num_nonzeros) + k_nonzero_indices = np.argpartition(np.abs(weights[nonzero_indices]), kth=k-1)[:k] + + pruned_indices = nonzero_indices[k_nonzero_indices] + weights[pruned_indices] = 0 + + return weights + + class ParallelOVRTrainer(threading.Thread): """A trainer for parallel 1vsrest training.""" @@ -103,6 +134,7 @@ class ParallelOVRTrainer(threading.Thread): weights: np.ndarray pbar: tqdm queue: queue.SimpleQueue + pruning_alpha: float def __init__(self): threading.Thread.__init__(self) @@ -114,6 +146,7 @@ def init_trainer( x: sparse.csr_matrix, options: str, verbose: bool, + pruning_alpha: float, ): """Initialize the parallel trainer by setting y, x, parameter and threading related variables as class variables of ParallelOVRTrainer. @@ -123,11 +156,13 @@ def init_trainer( x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str): The option string passed to liblinear. verbose (bool): Output extra progress information. + pruning_alpha (float): Fraction of weights to keep after pruning. 
""" x, options, bias = _prepare_options(x, options) cls.y = y.tocsc() cls.x = x cls.bias = bias + cls.pruning_alpha = pruning_alpha num_instances, num_classes = cls.y.shape num_features = cls.x.shape[1] cls.prob = problem(np.ones((num_instances,)), cls.x) @@ -187,7 +222,7 @@ def run(self): except queue.Empty: break yi = self.y[:, label_idx].toarray().reshape(-1) - self.weights[:, label_idx] = self._do_parallel_train(2 * yi - 1).ravel() + self.weights[:, label_idx] = _pruning_weights(self._do_parallel_train(2 * yi - 1).ravel(), self.pruning_alpha) self.pbar.update() @@ -198,6 +233,7 @@ def train_1vsrest( multiclass: bool = False, options: str = "", verbose: bool = True, + pruning_alpha: float = 1.0, ) -> FlatModel: """Train a linear model parallel on labels for multi-label data using a one-vs-rest strategy. @@ -207,12 +243,13 @@ def train_1vsrest( multiclass (bool, optional): A flag indicating if the dataset is multiclass. options (str, optional): The option string passed to liblinear. Defaults to ''. verbose (bool, optional): Output extra progress information. Defaults to True. + pruning_alpha (float, optional): Fraction of weights to keep after pruning. Defaults to 1.0 (no pruning). Returns: A model which can be used in predict_values. 
""" # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/ - ParallelOVRTrainer.init_trainer(y, x, options, verbose) + ParallelOVRTrainer.init_trainer(y, x, options, verbose, pruning_alpha) num_threads = psutil.cpu_count(logical=False) trainers = [ParallelOVRTrainer() for _ in range(num_threads)] for trainer in trainers: diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 87e7242e..485540df 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -211,6 +211,7 @@ def train_tree( dmax=DEFAULT_DMAX, verbose: bool = True, root: Node = None, + pruning_alpha: float = 1, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. The algorithm used is based on https://github.com/xmc-aalto/bonsai. @@ -222,6 +223,8 @@ def train_tree( K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. + root (Node, optional): Pre-built tree root. Defaults to None. + pruning_alpha (float optional): Fraction of weights to keep after pruning. Defaults to 1.0 (no pruning). Returns: TreeModel: A model which can be used in predict_values. @@ -242,6 +245,7 @@ def count(node): nonlocal num_nodes num_nodes += 1 node.num_features_used = np.count_nonzero(features_used_perlabel[:, node.label_map].sum(axis=1)) + node.pruning_alpha = pruning_alpha root.dfs(count) @@ -344,14 +348,14 @@ def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node: node (Node): Node to be trained. """ if node.isLeaf(): - node.model = linear.train_1vsrest(y[:, node.label_map], x, False, options, False) + node.model = linear.train_1vsrest(y[:, node.label_map], x, False, options, False, node.pruning_alpha) else: # meta_y[i, j] is 1 if the ith instance is relevant to the jth child. 
# getnnz returns an ndarray of shape number of instances. # This must be reshaped into number of instances * 1 to be interpreted as a column. meta_y = [y[:, child.label_map].getnnz(axis=1)[:, np.newaxis] > 0 for child in node.children] meta_y = sparse.csr_matrix(np.hstack(meta_y)) - node.model = linear.train_1vsrest(meta_y, x, False, options, False) + node.model = linear.train_1vsrest(meta_y, x, False, options, False, node.pruning_alpha) node.model.weights = sparse.csc_matrix(node.model.weights) diff --git a/linear_trainer.py b/linear_trainer.py index b9133857..d84991f3 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -66,6 +66,7 @@ def linear_train(datasets, config): options=config.liblinear_options, K=config.tree_degree, dmax=config.tree_max_depth, + pruning_alpha=config.pruning_alpha, ) else: model = LINEAR_TECHNIQUES[config.linear_technique]( diff --git a/main.py b/main.py index 70907edf..47bbea92 100644 --- a/main.py +++ b/main.py @@ -217,6 +217,12 @@ def add_all_arguments(parser): action="store_true", help="Save all the predictions with decision value larger then 0. If used, the save_k_predictions must be set to 0", ) + parser.add_argument( + "--pruning_alpha", + type=float, + default=1.0, + help="Fraction of weights to keep after pruning (1.0 means no pruning)." + ) # tree options parser.add_argument("--tree_degree", type=int, default=100, help="Degree of the tree (default: %(default)s)") From 487e20eb0362c75a3c2995c7a011581c3320d905 Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 27 Jan 2026 17:06:39 +0000 Subject: [PATCH 17/23] update prob estimation - the implementation is based on code from Guan-Ting's PR. 
--- grid.py | 2 +- libmultilabel/linear/tree.py | 36 ++++++++++++++++++++++++------------ linear_trainer.py | 1 + main.py | 8 +++++++- run_exp.py | 2 +- 5 files changed, 34 insertions(+), 15 deletions(-) diff --git a/grid.py b/grid.py index 5593644e..d6a1b2f9 100644 --- a/grid.py +++ b/grid.py @@ -55,7 +55,7 @@ class GridParameter: ] _predict_fields = [ ("beam_width", int, field(default=10)), - ("A", int, field(default=1)), + ("prob_A", int, field(default=1)), ] param_types = { diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 485540df..b5c2ca20 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -59,22 +59,35 @@ def __init__( self.multiclass = False self._model_separated = False # Indicates whether the model has been separated for pruning tree. + def sigmoid_A(self, x: np.ndarray, prob_A: int): + """ + Calculate log(sigmoid(prob_A * x)). + + Args: + x (np.ndarray): A matrix with dimension number of instances * number of classes. + prob_A (int): The tunable parameter of probability estimation function, that is sigmoid(prob_A * preds). + + Returns: + np.ndarray: A matrix with dimension number of instances * number of classes. + """ + return log_expit(prob_A * x) + def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, - A: int = 3, + prob_A: int = 3, ) -> np.ndarray: """Calculate the probability estimates associated with x. Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10. + prob_A (int, optional): The tunable parameter of probability estimation function, that is sigmoid(prob_A * preds). Defaults to 3. Returns: np.ndarray: A matrix with dimension number of instances * number of classes. """ - sigmoid_A = lambda x: log_expit(A * x) if beam_width >= len(self.root.children): # Beam_width is sufficiently large; pruning not applied. 
# Calculates decision values for all nodes. @@ -84,8 +97,8 @@ def predict_values( if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values(x, beam_width, sigmoid_A) # number of instances * (number of labels + total number of metalabels) - return np.vstack([self._beam_search(all_preds[i], beam_width, sigmoid_A) for i in range(all_preds.shape[0])]) + all_preds = self._prune_tree_and_predict_values(x, beam_width, prob_A) # number of instances * (number of labels + total number of metalabels) + return np.vstack([self._beam_search(all_preds[i], beam_width, prob_A) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): """ @@ -116,7 +129,7 @@ def _separate_model_for_pruning_tree(self): ) self.subtree_models.append(subtree_flatmodel) - def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, sigmoid_A: Callable) -> np.ndarray: + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, prob_A: int) -> np.ndarray: """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees. Only subtrees corresponding to the top beam_width candidates from the root are evaluated, @@ -125,6 +138,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. + prob_A (int, optional): The tunable parameter of probability estimation function, that is sigmoid(prob_A * preds). Returns: np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels). 
@@ -135,8 +149,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - # children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) - children_scores = 0.0 + sigmoid_A(root_preds) + children_scores = 0.0 + self.sigmoid_A(root_preds, prob_A) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -163,12 +176,13 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, return all_preds - def _beam_search(self, instance_preds: np.ndarray, beam_width: int, sigmoid_A: Callable) -> np.ndarray: + def _beam_search(self, instance_preds: np.ndarray, beam_width: int, prob_A: int) -> np.ndarray: """Predict with beam search using cached probability estimates for a single instance. Args: instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels. beam_width (int): Number of candidates considered. + prob_A (int, optional): The tunable parameter of probability estimation function, that is sigmoid(prob_A * preds). Returns: np.ndarray: A vector with dimension number of classes. 
@@ -186,8 +200,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int, sigmoid_A: C continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - # children_score = score - np.square(np.maximum(0, 1 - pred)) - children_score = score + sigmoid_A(pred) + children_score = score + self.sigmoid_A(pred, prob_A) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -198,8 +211,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int, sigmoid_A: C for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - # scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) - scores[node.label_map] = np.exp(score + sigmoid_A(pred)) + scores[node.label_map] = np.exp(score + self.sigmoid_A(pred, prob_A)) return scores diff --git a/linear_trainer.py b/linear_trainer.py index d84991f3..1ff18584 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -24,6 +24,7 @@ def linear_test(config, model, datasets, label_mapping): predict_kwargs = {} if isinstance(model, (TreeModel, EnsembleTreeModel)): predict_kwargs["beam_width"] = config.beam_width + predict_kwargs["prob_A"] = config.prob_A for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): slice = np.s_[i * config.eval_batch_size : (i + 1) * config.eval_batch_size] diff --git a/main.py b/main.py index 47bbea92..490c633b 100644 --- a/main.py +++ b/main.py @@ -221,7 +221,7 @@ def add_all_arguments(parser): "--pruning_alpha", type=float, default=1.0, - help="Fraction of weights to keep after pruning (1.0 means no pruning)." 
+ help="Fraction of weights to keep after pruning (1.0 means no pruning, default: %(default)s)" ) # tree options @@ -238,6 +238,12 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) + parser.add_argument( + "--prob_A", + type=int, + default=3, + help="The tunable parameter of probability estimation function, that is sigmoid(prob_A * preds) (default: %(default)s)", + ) # AttentionXML parser.add_argument( "--cluster_size", diff --git a/run_exp.py b/run_exp.py index b38fccf9..b277d3c5 100644 --- a/run_exp.py +++ b/run_exp.py @@ -30,7 +30,7 @@ def prune_model(*args, **kwargs): 'max_features': [10000, 20000], 'K': [10, 100], 'min_df': [1, 2], - 'A': [2, 3], + 'prob_A': [2, 3], 'c': [0.1, 0.2], } From d525c8568993bf21772d058ed83158012120faa0 Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 27 Jan 2026 19:21:10 +0000 Subject: [PATCH 18/23] update grid search for pruning_alpha --- grid.py | 65 +++++++++++++++++++++++++++------- libmultilabel/linear/linear.py | 5 ++- libmultilabel/linear/tree.py | 11 +++--- run_exp.py | 1 + 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/grid.py b/grid.py index d6a1b2f9..6862b494 100644 --- a/grid.py +++ b/grid.py @@ -7,10 +7,12 @@ import logging import libmultilabel.linear as linear -from libmultilabel.linear.tree import _build_tree +from libmultilabel.linear.tree import TreeModel, _build_tree +from libmultilabel.linear.linear import _pruning_weights import sklearn.preprocessing import numpy as np +import scipy.sparse as sparse import math @@ -51,11 +53,11 @@ class GridParameter: ("s", int, field(default=1)), ("c", float, field(default=1)), ("B", int, field(default=-1)), - # ("alpha", float, field(default=1)), + ("pruning_alpha", float, field(default=1)), ] _predict_fields = [ ("beam_width", int, field(default=10)), - ("prob_A", int, field(default=1)), + ("prob_A", int, field(default=3)), ] param_types = { @@ -84,8 +86,9 @@ def __init__(self, params: dict | None = 
None, fold: int = -1): @property def linear_options(self): options = "" - for f in fields(self.linear): - options += f" -{f.name} {getattr(self.linear, f.name)}" + linear_field_names = (self._param_field_names['linear'] - {'pruning_alpha'}) + for field_name in linear_field_names: + options += f" -{field_name} {getattr(self.linear, field_name)}" return options.strip() def __repr__(self): @@ -104,6 +107,27 @@ def __hash__(self): return hash(tuple(getattr(self, t) for t in self.param_types)) +def pruning_flat_model(flat_model: linear.FlatModel, pruning_alpha: float) -> np.ndarray: + """Prune the weights of the flat model. + + Args: + flat_model (linear.FlatModel): The flat model. + pruning_alpha (float): Fraction of weights to keep after pruning. + + Returns: + np.ndarray: The flat model with the pruned weights. + """ + num_classes = flat_model.weights.shape[1] + weights = [] + + for i in range(num_classes): + weight = flat_model.weights[:, i].toarray().ravel() + weights.append(sparse.csc_matrix(_pruning_weights(weight, pruning_alpha))) + + flat_model.weights = sparse.hstack(weights, "csc") + return flat_model + + class GridSearch: def __init__( self, @@ -202,12 +226,29 @@ def get_model(self, y, x, params): root = self.get_tree_root(y, x, params) linear_params = params.linear - self.no_cache |= (linear_params != self._cached_params.linear) - if self.no_cache: - logging.info(f"Model - Training: {linear_params}") - with __silent__(): - self._cached_params.linear = linear_params - self._cached_model = linear.train_tree(y, x, root=root, options=params.linear_options) + pruning_alpha = linear_params.pruning_alpha + + if self.no_cache or (linear_params != self._cached_params.linear): + if not self.no_cache and params.linear_options == self._cached_params.linear_options: + # The y, x, and linear_options are the same, which means the pruning_alpha is different. + # We prune the weights in-place, and the pruning_alpha is sorted in decreasing order. 
+ # Therefore, we must divide by the previous pruning_alpha. + previous_alpha = self._cached_params.linear.pruning_alpha + pruning_alpha /= previous_alpha + logging.info(f"Model - Pruning: {linear_params}, alpha: {pruning_alpha}") + self._cached_model.flat_model = pruning_flat_model(self._cached_model.flat_model, pruning_alpha) + else: + logging.info(f"Model - Training: {linear_params}") + with __silent__(): + self._cached_model = linear.train_tree( + y, + x, + root=root, + options=params.linear_options, + pruning_alpha=pruning_alpha + ) + + self._cached_params.linear = linear_params else: logging.info(f"Model - Using cached data: {linear_params}") @@ -237,7 +278,7 @@ def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, di self.search_space = sorted([ GridParameter(dict(zip(param_names, param_values))) for param_values in itertools.product(*search_space_dict.values()) - ]) + ], reverse=True) permutation = np.random.permutation(self.num_instances) index_per_fold = [ diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py index 2e7d6611..60431e5b 100644 --- a/libmultilabel/linear/linear.py +++ b/libmultilabel/linear/linear.py @@ -186,14 +186,14 @@ def del_trainer(cls): for key in list(cls.__annotations__): delattr(cls, key) - def _do_parallel_train(self, y: np.ndarray) -> np.matrix: + def _do_parallel_train(self, y: np.ndarray) -> np.ndarray: """Wrap around liblinear.liblinearutil.train. Args: y (np.ndarray): A +1/-1 array with dimensions number of instances * 1. Returns: - np.matrix: The weights. + np.ndarray: The weights. """ if y.shape[0] == 0: return np.matrix(np.zeros((self.prob.n, 1))) @@ -203,7 +203,6 @@ def _do_parallel_train(self, y: np.ndarray) -> np.matrix: model = train(prob, self.param) w = np.ctypeslib.as_array(model.w, (self.prob.n, 1)) - w = np.asmatrix(w) # When all labels are -1, we must flip the sign of the weights # because LIBLINEAR treats the first label as positive, which # is -1 in this case. 
But for our usage we need them to be negative. diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index b5c2ca20..a66826b8 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -308,13 +308,10 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, else: kmeans_algo = LloydKmeans - if False: - metalabels = np.random.randint(0, K, label_representation.shape[0]) - else: - kmeans = kmeans_algo( - n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=False - ) - metalabels = kmeans.fit(label_representation) + kmeans = kmeans_algo( + n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=False + ) + metalabels = kmeans.fit(label_representation) unique_labels = np.unique(metalabels) if len(unique_labels) == K: diff --git a/run_exp.py b/run_exp.py index b277d3c5..602dd78e 100644 --- a/run_exp.py +++ b/run_exp.py @@ -32,6 +32,7 @@ def prune_model(*args, **kwargs): 'min_df': [1, 2], 'prob_A': [2, 3], 'c': [0.1, 0.2], + 'pruning_alpha': [0.9, 0.7], } search = grid.GridSearch(dataset, n_folds, monitor_metrics) From 6cd10e4b44e685fd9fa1a8f60c293e0bd332e93e Mon Sep 17 00:00:00 2001 From: chcwww Date: Wed, 28 Jan 2026 05:26:50 +0000 Subject: [PATCH 19/23] fix the column dimension of the weights when pruning --- grid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grid.py b/grid.py index 6862b494..5f07571c 100644 --- a/grid.py +++ b/grid.py @@ -122,7 +122,7 @@ def pruning_flat_model(flat_model: linear.FlatModel, pruning_alpha: float) -> np for i in range(num_classes): weight = flat_model.weights[:, i].toarray().ravel() - weights.append(sparse.csc_matrix(_pruning_weights(weight, pruning_alpha))) + weights.append(sparse.csc_matrix(_pruning_weights(weight, pruning_alpha)[:, None])) flat_model.weights = sparse.hstack(weights, "csc") return flat_model From 9f496a7d7240ee9fdc2479a0e8f6f8fa2b238977 Mon Sep 17 00:00:00 
2001 From: chcwww Date: Tue, 10 Feb 2026 06:12:10 +0000 Subject: [PATCH 20/23] remove weights pruning code --- grid.py | 58 +++++++--------------------------- libmultilabel/linear/linear.py | 41 ++---------------------- libmultilabel/linear/tree.py | 7 ++-- linear_trainer.py | 1 - main.py | 6 ---- 5 files changed, 15 insertions(+), 98 deletions(-) diff --git a/grid.py b/grid.py index 5f07571c..d5736988 100644 --- a/grid.py +++ b/grid.py @@ -7,8 +7,7 @@ import logging import libmultilabel.linear as linear -from libmultilabel.linear.tree import TreeModel, _build_tree -from libmultilabel.linear.linear import _pruning_weights +from libmultilabel.linear.tree import _build_tree import sklearn.preprocessing import numpy as np @@ -53,7 +52,6 @@ class GridParameter: ("s", int, field(default=1)), ("c", float, field(default=1)), ("B", int, field(default=-1)), - ("pruning_alpha", float, field(default=1)), ] _predict_fields = [ ("beam_width", int, field(default=10)), @@ -86,8 +84,7 @@ def __init__(self, params: dict | None = None, fold: int = -1): @property def linear_options(self): options = "" - linear_field_names = (self._param_field_names['linear'] - {'pruning_alpha'}) - for field_name in linear_field_names: + for field_name in self._param_field_names['linear']: options += f" -{field_name} {getattr(self.linear, field_name)}" return options.strip() @@ -107,27 +104,6 @@ def __hash__(self): return hash(tuple(getattr(self, t) for t in self.param_types)) -def pruning_flat_model(flat_model: linear.FlatModel, pruning_alpha: float) -> np.ndarray: - """Prune the weights of the flat model. - - Args: - flat_model (linear.FlatModel): The flat model. - pruning_alpha (float): Fraction of weights to keep after pruning. - - Returns: - np.ndarray: The flat model with the pruned weights. 
- """ - num_classes = flat_model.weights.shape[1] - weights = [] - - for i in range(num_classes): - weight = flat_model.weights[:, i].toarray().ravel() - weights.append(sparse.csc_matrix(_pruning_weights(weight, pruning_alpha)[:, None])) - - flat_model.weights = sparse.hstack(weights, "csc") - return flat_model - - class GridSearch: def __init__( self, @@ -226,29 +202,17 @@ def get_model(self, y, x, params): root = self.get_tree_root(y, x, params) linear_params = params.linear - pruning_alpha = linear_params.pruning_alpha if self.no_cache or (linear_params != self._cached_params.linear): - if not self.no_cache and params.linear_options == self._cached_params.linear_options: - # The y, x, and linear_options are the same, which means the pruning_alpha is different. - # We prune the weights in-place, and the pruning_alpha is sorted in decreasing order. - # Therefore, we must divide by the previous pruning_alpha. - previous_alpha = self._cached_params.linear.pruning_alpha - pruning_alpha /= previous_alpha - logging.info(f"Model - Pruning: {linear_params}, alpha: {pruning_alpha}") - self._cached_model.flat_model = pruning_flat_model(self._cached_model.flat_model, pruning_alpha) - else: - logging.info(f"Model - Training: {linear_params}") - with __silent__(): - self._cached_model = linear.train_tree( - y, - x, - root=root, - options=params.linear_options, - pruning_alpha=pruning_alpha - ) - - self._cached_params.linear = linear_params + logging.info(f"Model - Training: {linear_params}") + with __silent__(): + self._cached_params.linear = linear_params + self._cached_model = linear.train_tree( + y, + x, + root=root, + options=params.linear_options, + ) else: logging.info(f"Model - Using cached data: {linear_params}") diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py index 60431e5b..2ecade65 100644 --- a/libmultilabel/linear/linear.py +++ b/libmultilabel/linear/linear.py @@ -92,37 +92,6 @@ def _to_dense_array(self, matrix: np.matrix | 
sparse.csr_matrix) -> np.ndarray: return np.asarray(matrix) -def _pruning_weights(weights: np.ndarray, pruning_alpha: float) -> np.ndarray: - """Prune the weights of the linear model. - - Args: - weights (np.ndarray): Linear model weights. - pruning_alpha (float): Fraction of weights to keep after pruning. - - Returns: - np.ndarray: The pruned weights. - """ - pruning_ratio = 1-pruning_alpha - - if 0 >= pruning_ratio: - return weights - elif pruning_ratio >= 1: - return np.zeros_like(weights) - else: - # Perform pruning algorithm - # Reduce the number of nonzero features per column by a factor of pruning_ratio. - nonzero_indices = np.flatnonzero(weights) - num_nonzeros = nonzero_indices.size - # Threshold - k = np.clip(int(pruning_ratio * num_nonzeros), 0, num_nonzeros) - k_nonzero_indices = np.argpartition(np.abs(weights[nonzero_indices]), kth=k-1)[:k] - - pruned_indices = nonzero_indices[k_nonzero_indices] - weights[pruned_indices] = 0 - - return weights - - class ParallelOVRTrainer(threading.Thread): """A trainer for parallel 1vsrest training.""" @@ -134,7 +103,6 @@ class ParallelOVRTrainer(threading.Thread): weights: np.ndarray pbar: tqdm queue: queue.SimpleQueue - pruning_alpha: float def __init__(self): threading.Thread.__init__(self) @@ -146,7 +114,6 @@ def init_trainer( x: sparse.csr_matrix, options: str, verbose: bool, - pruning_alpha: float, ): """Initialize the parallel trainer by setting y, x, parameter and threading related variables as class variables of ParallelOVRTrainer. @@ -156,13 +123,11 @@ def init_trainer( x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. options (str): The option string passed to liblinear. verbose (bool): Output extra progress information. - pruning_alpha (float): Fraction of weights to keep after pruning. 
""" x, options, bias = _prepare_options(x, options) cls.y = y.tocsc() cls.x = x cls.bias = bias - cls.pruning_alpha = pruning_alpha num_instances, num_classes = cls.y.shape num_features = cls.x.shape[1] cls.prob = problem(np.ones((num_instances,)), cls.x) @@ -221,7 +186,7 @@ def run(self): except queue.Empty: break yi = self.y[:, label_idx].toarray().reshape(-1) - self.weights[:, label_idx] = _pruning_weights(self._do_parallel_train(2 * yi - 1).ravel(), self.pruning_alpha) + self.weights[:, label_idx] = self._do_parallel_train(2 * yi - 1).ravel() self.pbar.update() @@ -232,7 +197,6 @@ def train_1vsrest( multiclass: bool = False, options: str = "", verbose: bool = True, - pruning_alpha: float = 1.0, ) -> FlatModel: """Train a linear model parallel on labels for multi-label data using a one-vs-rest strategy. @@ -242,13 +206,12 @@ def train_1vsrest( multiclass (bool, optional): A flag indicating if the dataset is multiclass. options (str, optional): The option string passed to liblinear. Defaults to ''. verbose (bool, optional): Output extra progress information. Defaults to True. - pruning_alpha (float, optional): Fraction of weights to keep after pruning. Defaults to 1.0 (no pruning). Returns: A model which can be used in predict_values. 
""" # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/ - ParallelOVRTrainer.init_trainer(y, x, options, verbose, pruning_alpha) + ParallelOVRTrainer.init_trainer(y, x, options, verbose) num_threads = psutil.cpu_count(logical=False) trainers = [ParallelOVRTrainer() for _ in range(num_threads)] for trainer in trainers: diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index a66826b8..9dba351e 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -223,7 +223,6 @@ def train_tree( dmax=DEFAULT_DMAX, verbose: bool = True, root: Node = None, - pruning_alpha: float = 1, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. The algorithm used is based on https://github.com/xmc-aalto/bonsai. @@ -236,7 +235,6 @@ def train_tree( dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. root (Node, optional): Pre-built tree root. Defaults to None. - pruning_alpha (float optional): Fraction of weights to keep after pruning. Defaults to 1.0 (no pruning). Returns: TreeModel: A model which can be used in predict_values. @@ -257,7 +255,6 @@ def count(node): nonlocal num_nodes num_nodes += 1 node.num_features_used = np.count_nonzero(features_used_perlabel[:, node.label_map].sum(axis=1)) - node.pruning_alpha = pruning_alpha root.dfs(count) @@ -357,14 +354,14 @@ def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node: node (Node): Node to be trained. """ if node.isLeaf(): - node.model = linear.train_1vsrest(y[:, node.label_map], x, False, options, False, node.pruning_alpha) + node.model = linear.train_1vsrest(y[:, node.label_map], x, False, options, False) else: # meta_y[i, j] is 1 if the ith instance is relevant to the jth child. # getnnz returns an ndarray of shape number of instances. 
# This must be reshaped into number of instances * 1 to be interpreted as a column. meta_y = [y[:, child.label_map].getnnz(axis=1)[:, np.newaxis] > 0 for child in node.children] meta_y = sparse.csr_matrix(np.hstack(meta_y)) - node.model = linear.train_1vsrest(meta_y, x, False, options, False, node.pruning_alpha) + node.model = linear.train_1vsrest(meta_y, x, False, options, False) node.model.weights = sparse.csc_matrix(node.model.weights) diff --git a/linear_trainer.py b/linear_trainer.py index 1ff18584..f5f374fa 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -67,7 +67,6 @@ def linear_train(datasets, config): options=config.liblinear_options, K=config.tree_degree, dmax=config.tree_max_depth, - pruning_alpha=config.pruning_alpha, ) else: model = LINEAR_TECHNIQUES[config.linear_technique]( diff --git a/main.py b/main.py index 490c633b..b330b8b7 100644 --- a/main.py +++ b/main.py @@ -217,12 +217,6 @@ def add_all_arguments(parser): action="store_true", help="Save all the predictions with decision value larger then 0. 
If used, the save_k_predictions must be set to 0", ) - parser.add_argument( - "--pruning_alpha", - type=float, - default=1.0, - help="Fraction of weights to keep after pruning (1.0 means no pruning, default: %(default)s)" - ) # tree options parser.add_argument("--tree_degree", type=int, default=100, help="Degree of the tree (default: %(default)s)") From 464aa593eb5e9d8252a7dd0e35a7f9ac92969488 Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 10 Feb 2026 06:37:29 +0000 Subject: [PATCH 21/23] move example code into main() in grid.py --- grid.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++- run_exp.py | 55 -------------------------------------------- 2 files changed, 66 insertions(+), 56 deletions(-) delete mode 100644 run_exp.py diff --git a/grid.py b/grid.py index d5736988..3ffbb23a 100644 --- a/grid.py +++ b/grid.py @@ -4,14 +4,15 @@ import os import sys import itertools +import argparse import logging import libmultilabel.linear as linear from libmultilabel.linear.tree import _build_tree +from libmultilabel.common_utils import timer import sklearn.preprocessing import numpy as np -import scipy.sparse as sparse import math @@ -273,3 +274,67 @@ def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, di ) return {params: metrics.compute() for params, metrics in self.param_metrics.items()} + + +@timer +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--seed", + type=int, + help="Random seed." + ) + parser.add_argument( + "--training_file", + help="Path to training data." + ) + parser.add_argument( + "--test_file", + help="Path to test data." + ) + parser.add_argument( + "--data_format", + type=str, + default="txt", + help="'svm' for SVM format or 'txt' for LibMultiLabel format." 
+ ) + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO) + if args.seed is not None: + np.random.seed(args.seed) + + dataset = linear.load_dataset( + args.data_format, + args.training_file, + args.test_file, + ) + + retrain = True + n_folds = 3 + monitor_metrics = ["P@1", "P@3", "P@5"] + search_space_dict = { + 'max_features': [10000] + } + + search = GridSearch(dataset, n_folds, monitor_metrics) + cv_scores = search(search_space_dict) + sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True) + print(sorted_cv_scores) + + if retrain: + best_params, best_cv_scores = list(sorted_cv_scores)[0] + print(best_params, best_cv_scores) + + preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) + transformed_dataset = preprocessor.fit_transform(dataset) + model = linear.train_tree( + transformed_dataset["train"]["y"], + transformed_dataset["train"]["x"], + best_params.linear_options, + **asdict(best_params.tree), + ) + + +if __name__ == "__main__": + main() diff --git a/run_exp.py b/run_exp.py deleted file mode 100644 index 602dd78e..00000000 --- a/run_exp.py +++ /dev/null @@ -1,55 +0,0 @@ -import libmultilabel.linear as linear -import grid - -import numpy as np -from dataclasses import asdict - - -def prune_model(*args, **kwargs): - pass - - -if __name__ == "__main__": - import argparse - import logging - - logging.basicConfig(level=logging.INFO) - np.random.seed(20260123) - - parser = argparse.ArgumentParser(description="Parse command-line arguments.") - parser.add_argument("--dataset", type=str, default="EUR-Lex", help="Dataset name (e.g., AmazonCat-13K, EUR-Lex)") - parser.add_argument("--data_format", type=str, default="txt", help="Data format.") - args = parser.parse_args() - - dataset = linear.load_dataset(args.data_format, f"data/{args.dataset}/train.{args.data_format}") # , f"data/{dataset}/test.{args.data_format}" - - retrain = True - n_folds = 3 - monitor_metrics = ["P@1", "P@3", 
"P@5"] - search_space_dict = { - 'max_features': [10000, 20000], - 'K': [10, 100], - 'min_df': [1, 2], - 'prob_A': [2, 3], - 'c': [0.1, 0.2], - 'pruning_alpha': [0.9, 0.7], - } - - search = grid.GridSearch(dataset, n_folds, monitor_metrics) - cv_scores = search(search_space_dict) - sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True) - print(sorted_cv_scores) - - if retrain: - # TODO: test set - best_params, best_cv_scores = list(sorted_cv_scores)[0] - print(best_params, best_cv_scores) - - preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) - transformed_dataset = preprocessor.fit_transform(dataset) - model = linear.train_tree( - transformed_dataset["train"]["y"], - transformed_dataset["train"]["x"], - best_params.linear_options, - **asdict(best_params.tree), - ) From d7ffd2971f94c2d86a295dbbf45d85a9a311f818 Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 10 Feb 2026 06:44:53 +0000 Subject: [PATCH 22/23] apply black formatter --- grid.py | 100 +++++++++++++-------------------- libmultilabel/linear/linear.py | 5 +- libmultilabel/linear/tree.py | 2 +- 3 files changed, 44 insertions(+), 63 deletions(-) diff --git a/grid.py b/grid.py index 3ffbb23a..e219bca3 100644 --- a/grid.py +++ b/grid.py @@ -44,20 +44,20 @@ class GridParameter: ("stop_words", str | list, field(default=None)), ("strip_accents", str | Callable, field(default=None)), ("tokenizer", Callable, field(default=None)), - ] + ] _tree_fields = [ ("dmax", int, field(default=10)), ("K", int, field(default=8)), - ] + ] _linear_fields = [ ("s", int, field(default=1)), ("c", float, field(default=1)), ("B", int, field(default=-1)), - ] + ] _predict_fields = [ ("beam_width", int, field(default=10)), ("prob_A", int, field(default=3)), - ] + ] param_types = { "tfidf": make_dataclass("TfidfParams", _tfidf_fields, frozen=True, order=True), @@ -66,8 +66,7 @@ class GridParameter: "predict": make_dataclass("PredictParams", _predict_fields, frozen=True, 
order=True), } _param_field_names = { - param_type: {f.name for f in fields(class_name)} - for param_type, class_name in param_types.items() + param_type: {f.name for f in fields(class_name)} for param_type, class_name in param_types.items() } def __init__(self, params: dict | None = None, fold: int = -1): @@ -85,7 +84,7 @@ def __init__(self, params: dict | None = None, fold: int = -1): @property def linear_options(self): options = "" - for field_name in self._param_field_names['linear']: + for field_name in self._param_field_names["linear"]: options += f" -{field_name} {getattr(self.linear, field_name)}" return options.strip() @@ -139,12 +138,12 @@ def take(data, idx): "data_format": self.datasets["data_format"], "train": { "y": take(self.datasets["train"]["y"], train_idx), - "x": take(self.datasets["train"]["x"], train_idx) + "x": take(self.datasets["train"]["x"], train_idx), }, "test": { "y": take(self.datasets["train"]["y"], valid_idx), - "x": take(self.datasets["train"]["x"], valid_idx) - } + "x": take(self.datasets["train"]["x"], valid_idx), + }, } def get_transformed_dataset(self, dataset, params): @@ -158,11 +157,11 @@ def get_transformed_dataset(self, dataset, params): dict[str, np.matrix]: The keys should be "y" and "x". 
""" tfidf_params = params.tfidf - self.no_cache = (tfidf_params != self._cached_params.tfidf) + self.no_cache = tfidf_params != self._cached_params.tfidf if self.no_cache: logging.info(f"TFIDF - Preprocessing: {tfidf_params}") if self.datasets["data_format"] not in {"txt", "dataframe"}: - logging.info('The TF-IDF parameters are only meaningful for the “txt” and “dataframe” data formats.') + logging.info("The TF-IDF parameters are only meaningful for the “txt” and “dataframe” data formats.") with __silent__(): preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) self._cached_params.tfidf = tfidf_params @@ -174,14 +173,16 @@ def get_transformed_dataset(self, dataset, params): def get_tree_root(self, y, x, params): tree_params = params.tree - self.no_cache |= (tree_params != self._cached_params.tree) + self.no_cache |= tree_params != self._cached_params.tree if self.no_cache: logging.info(f"Tree - Preprocessing: {tree_params}") with __silent__(): label_representation = (y.T * x).tocsr() label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) self._cached_params.tree = tree_params - self._cached_tree_root = _build_tree(label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params)) + self._cached_tree_root = _build_tree( + label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params) + ) self._cached_tree_root.is_root = True else: logging.info(f"Tree - Using cached data: {tree_params}") @@ -213,7 +214,7 @@ def get_model(self, y, x, params): x, root=root, options=params.linear_options, - ) + ) else: logging.info(f"Model - Using cached data: {linear_params}") @@ -230,9 +231,7 @@ def compute_scores(self, y, x, model, params): self.param_metrics[params] = linear.get_metrics(self.monitor_metrics, num_classes=y.shape[1]) for i in range(num_batches): - preds = model.predict_values( - x[i * batch_size : (i + 1) * batch_size], - **asdict(params.predict)) + preds = model.predict_values(x[i * batch_size : (i 
+ 1) * batch_size], **asdict(params.predict)) target = y[i * batch_size : (i + 1) * batch_size].toarray() self.param_metrics[params].update(preds, target) @@ -240,19 +239,24 @@ def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, di self.param_metrics.clear() param_names = search_space_dict.keys() - self.search_space = sorted([ - GridParameter(dict(zip(param_names, param_values))) - for param_values in itertools.product(*search_space_dict.values()) - ], reverse=True) + self.search_space = sorted( + [ + GridParameter(dict(zip(param_names, param_values))) + for param_values in itertools.product(*search_space_dict.values()) + ], + reverse=True, + ) permutation = np.random.permutation(self.num_instances) index_per_fold = [ - permutation[int(fold * self.num_instances / self.n_folds):int((fold+1) * self.num_instances / self.n_folds)] + permutation[ + int(fold * self.num_instances / self.n_folds) : int((fold + 1) * self.num_instances / self.n_folds) + ] for fold in range(self.n_folds) ] for fold in range(self.n_folds): - train_idx = np.concatenate(index_per_fold[:fold] + index_per_fold[fold+1:]) + train_idx = np.concatenate(index_per_fold[:fold] + index_per_fold[fold + 1 :]) valid_idx = index_per_fold[fold] fold_dataset = self.get_fold_dataset(train_idx, valid_idx) @@ -261,17 +265,8 @@ def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, di logging.info(f"Status - Running fold {fold}, params: {params}") transformed_dataset = self.get_transformed_dataset(fold_dataset, params) - model = self.get_model( - transformed_dataset["train"]["y"], - transformed_dataset["train"]["x"], - params - ) - self.compute_scores( - transformed_dataset["test"]["y"], - transformed_dataset["test"]["x"], - model, - params - ) + model = self.get_model(transformed_dataset["train"]["y"], transformed_dataset["train"]["x"], params) + self.compute_scores(transformed_dataset["test"]["y"], transformed_dataset["test"]["x"], model, params) return {params: 
metrics.compute() for params, metrics in self.param_metrics.items()} @@ -279,24 +274,11 @@ def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, di @timer def main(): parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, help="Random seed.") + parser.add_argument("--training_file", help="Path to training data.") + parser.add_argument("--test_file", help="Path to test data.") parser.add_argument( - "--seed", - type=int, - help="Random seed." - ) - parser.add_argument( - "--training_file", - help="Path to training data." - ) - parser.add_argument( - "--test_file", - help="Path to test data." - ) - parser.add_argument( - "--data_format", - type=str, - default="txt", - help="'svm' for SVM format or 'txt' for LibMultiLabel format." + "--data_format", type=str, default="txt", help="'svm' for SVM format or 'txt' for LibMultiLabel format." ) args = parser.parse_args() @@ -313,9 +295,7 @@ def main(): retrain = True n_folds = 3 monitor_metrics = ["P@1", "P@3", "P@5"] - search_space_dict = { - 'max_features': [10000] - } + search_space_dict = {"max_features": [10000]} search = GridSearch(dataset, n_folds, monitor_metrics) cv_scores = search(search_space_dict) @@ -329,11 +309,11 @@ def main(): preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) transformed_dataset = preprocessor.fit_transform(dataset) model = linear.train_tree( - transformed_dataset["train"]["y"], - transformed_dataset["train"]["x"], - best_params.linear_options, - **asdict(best_params.tree), - ) + transformed_dataset["train"]["y"], + transformed_dataset["train"]["x"], + best_params.linear_options, + **asdict(best_params.tree), + ) if __name__ == "__main__": diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py index 2ecade65..04d25a21 100644 --- a/libmultilabel/linear/linear.py +++ b/libmultilabel/linear/linear.py @@ -151,14 +151,14 @@ def del_trainer(cls): for key in list(cls.__annotations__): delattr(cls, key) - def 
_do_parallel_train(self, y: np.ndarray) -> np.ndarray: + def _do_parallel_train(self, y: np.ndarray) -> np.matrix: """Wrap around liblinear.liblinearutil.train. Args: y (np.ndarray): A +1/-1 array with dimensions number of instances * 1. Returns: - np.ndarray: The weights. + np.matrix: The weights. """ if y.shape[0] == 0: return np.matrix(np.zeros((self.prob.n, 1))) @@ -168,6 +168,7 @@ def _do_parallel_train(self, y: np.ndarray) -> np.ndarray: model = train(prob, self.param) w = np.ctypeslib.as_array(model.w, (self.prob.n, 1)) + w = np.asmatrix(w) # When all labels are -1, we must flip the sign of the weights # because LIBLINEAR treats the first label as positive, which # is -1 in this case. But for our usage we need them to be negative. diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 9dba351e..eb4934eb 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -306,7 +306,7 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray, kmeans_algo = LloydKmeans kmeans = kmeans_algo( - n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=False + n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=True ) metalabels = kmeans.fit(label_representation) From 70fdfcb6d52cb3f0abe1c0b37b549f464e527a81 Mon Sep 17 00:00:00 2001 From: chcwww Date: Tue, 10 Feb 2026 09:02:09 +0000 Subject: [PATCH 23/23] update search_space_dict in the grid search example --- grid.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/grid.py b/grid.py index e219bca3..2a83b3c6 100644 --- a/grid.py +++ b/grid.py @@ -295,7 +295,13 @@ def main(): retrain = True n_folds = 3 monitor_metrics = ["P@1", "P@3", "P@5"] - search_space_dict = {"max_features": [10000]} + search_space_dict = { + "max_features": [10000, 20000, 100000], + "K": [10, 50, 100], + "min_df": [1, 2], + "prob_A": [2, 3, 4], + "c": [0.1, 0.2, 1, 10], + } search = 
GridSearch(dataset, n_folds, monitor_metrics) cv_scores = search(search_space_dict)