From bffa43f9b9c08a506beece00c525a7c48bd217d6 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Tue, 22 Jul 2025 16:55:39 +0800
Subject: [PATCH 1/9] Draft first runnable version: remove torchtext from
 nn/data_utils.py

---
 libmultilabel/nn/data_utils.py | 176 +++++++++++++++++++++------------
 main.py                        |   1 +
 2 files changed, 115 insertions(+), 62 deletions(-)

diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index 1d48bb06..b6e104f9 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -1,7 +1,12 @@
 import csv
 import gc
 import logging
+import os
+import re
 import warnings
+import zipfile
+from urllib.request import urlretrieve
+from collections import Counter, OrderedDict
 
 import pandas as pd
 import torch
@@ -11,7 +16,6 @@
 from sklearn.preprocessing import MultiLabelBinarizer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import Dataset
-from torchtext.vocab import build_vocab_from_iterator, pretrained_aliases, Vocab
 from tqdm import tqdm
 
 transformers.logging.set_verbosity_error()
@@ -19,6 +23,14 @@
 UNK = "<unk>"
 PAD = "<pad>"
 
+PRETRAINED_ALIASES = {
+    "glove.42B.300d",
+    "glove.840B.300d",
+    "glove.6B.50d",
+    "glove.6B.100d",
+    "glove.6B.200d",
+    "glove.6B.300d",
+}
 
 
 class TextDataset(Dataset):
@@ -31,8 +43,7 @@ class TextDataset(Dataset):
         add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
         tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
             the transformer-based pretrained language model. Defaults to None.
-        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
-            map tokens to indices. Defaults to None.
+        word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
     """
 
     def __init__(
@@ -55,7 +66,7 @@ def __init__(
         self.num_classes = len(self.classes)
         self.label_binarizer = MultiLabelBinarizer().fit([classes])
 
-        if not isinstance(self.word_dict, Vocab) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
+        if not isinstance(self.word_dict, dict) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
             raise ValueError("Please specify exactly one of word_dict or tokenizer")
 
     def __len__(self):
@@ -71,7 +82,7 @@ def __getitem__(self, index):
             else:
                 input_ids = self.tokenizer.encode(data["text"], add_special_tokens=False)
         else:
-            input_ids = [self.word_dict[word] for word in data["text"]]
+            input_ids = [self.word_dict.get(word, self.word_dict[UNK]) for word in data["text"]]
         return {
             "text": torch.LongTensor(input_ids[: self.max_seq_length]),
             "label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]),
         }
@@ -128,8 +139,7 @@ def get_dataset_loader(
         add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
         tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
             the transformer-based pretrained language model. Defaults to None.
-        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
-            map tokens to indices. Defaults to None.
+        word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
 
     Returns:
         torch.utils.data.DataLoader: A pytorch DataLoader.
@@ -276,9 +286,9 @@ def load_or_build_text_dict(
         embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
         silent (bool, optional): Enable silent mode. Defaults to False.
         normalize_embed (bool, optional): Whether the embeddings of each word is normalized to a unit vector. Defaults to False.
-
+
     Returns:
-        tuple[torchtext.vocab.Vocab, torch.Tensor]: A vocab object which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
+        tuple[dict, torch.Tensor]: A dictionary which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
     """
     if vocab_file:
         logging.info(f"Load vocab from {vocab_file}")
@@ -286,14 +296,14 @@
             vocab_list = [[vocab.strip() for vocab in fp.readlines()]]
         # Keep PAD index 0 to align `padding_idx` of
         # class Embedding in libmultilabel.nn.networks.modules.
-        vocabs = build_vocab_from_iterator(vocab_list, min_freq=1, specials=[PAD, UNK])
+        word_dict = _build_word_dict(vocab_list, min_vocab_freq=1, specials=[PAD, UNK])
     else:
         vocab_list = [set(data["text"]) for data in dataset]
-        vocabs = build_vocab_from_iterator(vocab_list, min_freq=min_vocab_freq, specials=[PAD, UNK])
-    vocabs.set_default_index(vocabs[UNK])
-    logging.info(f"Read {len(vocabs)} vocabularies.")
+        word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK])  # we don't need min_vocab_freq as we use set
+
+    logging.info(f"Read {len(word_dict)} vocabularies.")  # TBD: check if pad unk is included
 
-    embedding_weights = get_embedding_weights_from_file(vocabs, embed_file, silent, embed_cache_dir)
+    embedding_weights = get_embedding_weights_from_file(word_dict, embed_file, silent, embed_cache_dir)
 
     if normalize_embed:
         # To have better precision for calculating the normalization, we convert the original
@@ -306,7 +316,40 @@
             embedding_weights[i] = vector / float(torch.linalg.norm(vector) + 1e-6)
         embedding_weights = embedding_weights.float()
 
-    return vocabs, embedding_weights
+    return word_dict, embedding_weights
+
+
+def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
+    r"""Build word dictionary, modified from torchtext.vocab.build-vocab-from-iterator (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
+
+    Args:
+        vocab_list: List of words.
+        min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
+        specials: Special tokens (e.g., <pad>, <unk>) to add.
+
+    Returns:
+        dict: A dictionary which maps tokens to indices.
+    """
+
+    counter = Counter()
+    for tokens in vocab_list:
+        counter.update(tokens)
+
+    # sort by descending frequency, then lexicographically
+    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
+    ordered_dict = OrderedDict(sorted_by_freq_tuples)
+
+    # add special tokens at the beginning
+    tokens = specials or []
+    for token, freq in ordered_dict.items():
+        if freq >= min_vocab_freq:
+            tokens.append(token)
+
+    # build token to indices dict
+    word_dict = dict()
+    for idx, token in enumerate(tokens):
+        word_dict[token] = idx
+    return word_dict
 
 
 def load_or_build_label(datasets, label_file=None, include_test_labels=False):
@@ -344,70 +387,79 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False):
 
     return classes
 
 
-def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=None):
+def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None):
     """If the word exists in the embedding file, load the pretrained word embedding.
     Otherwise, assign a zero vector to that word.
 
     Args:
-        word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
+        word_dict (dict): A vocab object which maps tokens to indices.
         embed_file (str): Path to a file holding pre-trained embeddings.
         silent (bool, optional): Enable silent mode. Defaults to False.
-        cache (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
+        cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
 
     Returns:
        torch.Tensor: Embedding weights (vocab_size, embed_size).
     """
-    # Load pretrained word embedding
-    load_embedding_from_file = embed_file not in pretrained_aliases
-    if load_embedding_from_file:
-        logging.info(f"Load pretrained embedding from file: {embed_file}.")
-        with open(embed_file) as f:
-            word_vectors = f.readlines()
-        embed_size = len(word_vectors[0].split()) - 1
-        vector_dict = {}
-        for word_vector in tqdm(word_vectors, disable=silent):
-            word, vector = word_vector.rstrip().split(" ", 1)
-            vector = torch.Tensor(list(map(float, vector.split())))
-            vector_dict[word] = vector
-    else:
-        logging.info(f"Load pretrained embedding from torchtext.")
-        # Adapted from https://pytorch.org/text/0.9.0/_modules/torchtext/vocab.html#Vocab.load_vectors.
-        if embed_file not in pretrained_aliases:
-            raise ValueError(
-                "Got embed_file {}, but allowed pretrained "
-                "vectors are {}".format(embed_file, list(pretrained_aliases.keys()))
-            )
-
-        # Hotfix: Glove URLs are outdated in Torchtext
-        # (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py#L213-L217)
-        pretrained_cls = pretrained_aliases[embed_file]
-        if embed_file.startswith("glove"):
-            for name, url in pretrained_cls.func.url.items():
-                file_name = url.split("/")[-1]
-                pretrained_cls.func.url[name] = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{file_name}"
-
-        vector_dict = pretrained_cls(cache=cache)
-        embed_size = vector_dict.dim
-    embedding_weights = torch.zeros(len(word_dict), embed_size)
+    if embed_file in PRETRAINED_ALIASES:
+        embed_file = _download_pretrained_embedding(embed_file, cache_dir=cache_dir)
+    elif not os.path.isfile(embed_file):
+        raise ValueError(
+            "Got embed_file {}, but allowed pretrained "
+            "embeddings are {}".format(embed_file, PRETRAINED_ALIASES)
+        )
+
+    logging.info(f"Load pretrained embedding from file: {embed_file}.")
+    with open(embed_file) as f:
+        word_vectors = f.readlines()
+    embed_size = len(word_vectors[0].split()) - 1
+
+    vector_dict = {}
+    for word_vector in tqdm(word_vectors, disable=silent):
+        word, vector = word_vector.rstrip().split(" ", 1)
+        vector = torch.Tensor(list(map(float, vector.split())))
+        vector_dict[word] = vector
 
-    if load_embedding_from_file:
-        # Add UNK embedding
-        # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size)
-        # CAML: np.random.randn(embed_size)
-        unk_vector = torch.randn(embed_size)
-        embedding_weights[word_dict[UNK]] = unk_vector
+    embedding_weights = torch.zeros(len(word_dict), embed_size)
+    # Add UNK embedding
+    # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size)
+    # CAML: np.random.randn(embed_size)
+    unk_vector = torch.randn(embed_size)
+    embedding_weights[word_dict[UNK]] = unk_vector
 
     # Store pretrained word embedding
     vec_counts = 0
-    for word in word_dict.get_itos():
-        # The condition can be used to process the word that does not in the embedding file.
-        # Note that torchtext vector object has already dealt with this,
-        # so we can directly make a query without addtional handling.
-        if (load_embedding_from_file and word in vector_dict) or not load_embedding_from_file:
+    # for word in word_dict.get_itos(): # list of words
+    for word in word_dict.keys():
+        if word in vector_dict:
             embedding_weights[word_dict[word]] = vector_dict[word]
-            vec_counts += 1
+        vec_counts += 1
 
     logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings")
 
     return embedding_weights
+
+
+def _download_pretrained_embedding(embed_file, cache_dir=None):
+    """Download pretrained glove embedding from https://huggingface.co/stanfordnlp/glove/tree/main.
+
+    Returns:
+        str: Path to the cached or downloaded embedding file.
+    """
+    cached_embed_file = f"{cache_dir}/{embed_file}.txt"
+    if os.path.isfile(cached_embed_file):
+        return cached_embed_file
+    os.makedirs(cache_dir, exist_ok=True)
+
+    remote_embed_file = re.sub(r"6B.*", "6B", embed_file) + ".zip"
+    url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}"
+    logging.info(f"Downloading pretrained embedding from {url}.")
+    try:
+        zip_file, _ = urlretrieve(url, f"{cache_dir}/{remote_embed_file}")
+        with zipfile.ZipFile(zip_file, "r") as zf:
+            zf.extractall(cache_dir)
+    except Exception as e:
+        os.remove(zip_file)
+        raise e
+
+    return cached_embed_file
diff --git a/main.py b/main.py
index 12564f6b..d87fbdf9 100644
--- a/main.py
+++ b/main.py
@@ -189,6 +189,7 @@ def add_all_arguments(parser):
     parser.add_argument(
         "--embed_cache_dir",
         type=str,
+        default=".vector_cache",
         help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)",
     )
     parser.add_argument(

From bb2261db5b5aab21eb8439fa121f57df57dc0258 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Thu, 24 Jul 2025 09:27:34 +0800
Subject: [PATCH 2/9] Update comments related to torchtext.

---
 docs/cli/nn.rst                         |  3 ++-
 docs/examples/plot_KimCNN_quickstart.py |  2 +-
 libmultilabel/nn/data_utils.py          | 23 ++++++++++-------------
 libmultilabel/nn/model.py               |  2 +-
 libmultilabel/nn/nn_utils.py            |  3 +--
 search_params.py                        |  4 ++--
 tests/nn/components.py                  |  2 +-
 torch_trainer.py                        |  4 ++--
 8 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/docs/cli/nn.rst b/docs/cli/nn.rst
index efe2f5f2..7391138d 100644
--- a/docs/cli/nn.rst
+++ b/docs/cli/nn.rst
@@ -77,7 +77,8 @@ If a model was trained before by this package, the training procedure can start
 
 To use your own word embeddings or vocabulary set, specify the following parameters:
 
-- **embed_file**: choose one of the pretrained embeddings defined in `torchtext `_ or specify the path to your word embeddings with each line containing a word followed by its vectors. Example:
+- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`,
+`glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors.
 
 .. code-block::
 
diff --git a/docs/examples/plot_KimCNN_quickstart.py b/docs/examples/plot_KimCNN_quickstart.py
index 39efd6ba..49ae1f0d 100644
--- a/docs/examples/plot_KimCNN_quickstart.py
+++ b/docs/examples/plot_KimCNN_quickstart.py
@@ -32,7 +32,7 @@
 # To run KimCNN, LibMultiLabel tokenizes documents and uses an embedding vector for each word.
 # Thus, ``tokenize_text=True`` is set.
 #
-# We choose ``glove.6B.300d`` from torchtext as embedding vectors.
+# We choose ``glove.6B.300d`` as embedding vectors.
 
 datasets = load_datasets("data/rcv1/train.txt", "data/rcv1/test.txt", tokenize_text=True)
 classes = load_or_build_label(datasets)
diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index b6e104f9..6d9c3f95 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -275,8 +275,7 @@ def load_or_build_text_dict(
 ):
     """Build or load the vocabulary from the training dataset or the predefined `vocab_file`.
     The pretrained embedding can be either from a self-defined `embed_file` or from one of
-    the vectors defined in torchtext.vocab.pretrained_aliases
-    (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py).
+    the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`.
 
     Args:
         dataset (list): List of training instances with index, label, and tokenized text.
@@ -286,7 +285,7 @@ def load_or_build_text_dict(
         embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
         silent (bool, optional): Enable silent mode. Defaults to False.
         normalize_embed (bool, optional): Whether the embeddings of each word is normalized to a unit vector. Defaults to False.
-
+
     Returns:
         tuple[dict, torch.Tensor]: A dictionary which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
     """
@@ -299,9 +298,9 @@ def load_or_build_text_dict(
         word_dict = _build_word_dict(vocab_list, min_vocab_freq=1, specials=[PAD, UNK])
     else:
         vocab_list = [set(data["text"]) for data in dataset]
-        word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK])  # we don't need min_vocab_freq as we use set
+        word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK])
 
-    logging.info(f"Read {len(word_dict)} vocabularies.")  # TBD: check if pad unk is included
+    logging.info(f"Read {len(word_dict)} vocabularies.")
 
     embedding_weights = get_embedding_weights_from_file(word_dict, embed_file, silent, embed_cache_dir)
 
@@ -334,7 +333,7 @@ def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
     counter = Counter()
     for tokens in vocab_list:
         counter.update(tokens)
-
+
     # sort by descending frequency, then lexicographically
     sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
     ordered_dict = OrderedDict(sorted_by_freq_tuples)
@@ -392,7 +391,7 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
     Otherwise, assign a zero vector to that word.
 
     Args:
-        word_dict (dict): A vocab object which maps tokens to indices.
+        word_dict (dict): A dictionary for mapping tokens to indices.
         embed_file (str): Path to a file holding pre-trained embeddings.
         silent (bool, optional): Enable silent mode. Defaults to False.
         cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
@@ -405,15 +404,14 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
         embed_file = _download_pretrained_embedding(embed_file, cache_dir=cache_dir)
     elif not os.path.isfile(embed_file):
         raise ValueError(
-            "Got embed_file {}, but allowed pretrained "
-            "embeddings are {}".format(embed_file, PRETRAINED_ALIASES)
+            "Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, PRETRAINED_ALIASES)
         )
 
     logging.info(f"Load pretrained embedding from file: {embed_file}.")
     with open(embed_file) as f:
         word_vectors = f.readlines()
     embed_size = len(word_vectors[0].split()) - 1
-
+
     vector_dict = {}
     for word_vector in tqdm(word_vectors, disable=silent):
         word, vector = word_vector.rstrip().split(" ", 1)
@@ -429,7 +427,6 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
 
     # Store pretrained word embedding
     vec_counts = 0
-    # for word in word_dict.get_itos(): # list of words
     for word in word_dict.keys():
         if word in vector_dict:
             embedding_weights[word_dict[word]] = vector_dict[word]
@@ -448,9 +445,9 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
     """
     cached_embed_file = f"{cache_dir}/{embed_file}.txt"
     if os.path.isfile(cached_embed_file):
-        return cached_embed_file
+        return cached_embed_file
     os.makedirs(cache_dir, exist_ok=True)
-
+
     remote_embed_file = re.sub(r"6B.*", "6B", embed_file) + ".zip"
     url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}"
     logging.info(f"Downloading pretrained embedding from {url}.")
diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py
index a1dcf070..f7f76439 100644
--- a/libmultilabel/nn/model.py
+++ b/libmultilabel/nn/model.py
@@ -181,7 +181,7 @@ class Model(MultiLabelModel):
 
     Args:
         classes (list): List of class names.
-        word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
+        word_dict (dict): A dictionary for mapping tokens to indices.
         network (nn.Module): Network (i.e., CAML, KimCNN, or XMLCNN).
         loss_function (str, optional): Loss function name (i.e., binary_cross_entropy_with_logits,
             cross_entropy). Defaults to 'binary_cross_entropy_with_logits'.
diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py
index a4ac82c2..f9107d01 100644
--- a/libmultilabel/nn/nn_utils.py
+++ b/libmultilabel/nn/nn_utils.py
@@ -61,8 +61,7 @@ def init_model(
         model_name (str): Model to be used such as KimCNN.
         network_config (dict): Configuration for defining the network.
         classes (list): List of class names.
-        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
-            map tokens to indices. Defaults to None.
+        word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
         embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape
             (vocab_size, embed_dim). Defaults to None.
         init_weight (str): Weight initialization method from `torch.nn.init`.
diff --git a/search_params.py b/search_params.py
index aad38ece..c4d8b9db 100644
--- a/search_params.py
+++ b/search_params.py
@@ -25,8 +25,8 @@ def train_libmultilabel_tune(config, datasets, classes, word_dict):
     Args:
         config (dict): Config of the experiment.
         datasets (dict): A dictionary of datasets.
-        classes(list): List of class names.
-        word_dict(torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
+        classes (list): List of class names.
+        word_dict (dict): A dictionary for mapping tokens to indices.
""" # ray convert AttributeDict to dict diff --git a/tests/nn/components.py b/tests/nn/components.py index b74a1c15..bcfbcd68 100644 --- a/tests/nn/components.py +++ b/tests/nn/components.py @@ -20,7 +20,7 @@ def get_name(self): return "token_to_id" def get_from_trainer(self, trainer): - return trainer.model.word_dict.get_stoi() + return trainer.model.word_dict def compare(self, a, b): return a == b diff --git a/torch_trainer.py b/torch_trainer.py index 8dc259b5..a7f0641d 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -19,7 +19,7 @@ class TorchTrainer: config (AttributeDict): Config of the experiment. datasets (dict, optional): Datasets for training, validation, and test. Defaults to None. classes(list, optional): List of class names. - word_dict(torchtext.vocab.Vocab, optional): A vocab object which maps tokens to indices. + word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None. embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape (vocab_size, embed_dim). save_checkpoints (bool, optional): Whether to save the last and the best checkpoint or not. Defaults to True. @@ -136,7 +136,7 @@ def _setup_model( Args: classes(list): List of class names. - word_dict(torchtext.vocab.Vocab): A vocab object which maps tokens to indices. + word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None. embed_vecs (torch.Tensor): The pre-trained word vectors of shape (vocab_size, embed_dim). log_path (str): Path to the log file. The log file contains the validation results for each epoch and the test results. If the `log_path` is None, no performance From e19d93435b6fe0b56ab780663533c38a40ee789d Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Thu, 24 Jul 2025 10:57:59 +0800 Subject: [PATCH 3/9] restore indent --- libmultilabel/nn/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 6d9c3f95..fdd56d09 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -430,7 +430,7 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d for word in word_dict.keys(): if word in vector_dict: embedding_weights[word_dict[word]] = vector_dict[word] - vec_counts += 1 + vec_counts += 1 logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings") From 39bdde8e16a5fdbd2ae88714a1836e04b6c59a8a Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 25 Jul 2025 17:28:45 +0800 Subject: [PATCH 4/9] (1) Update versions: torch, transformers (2) fix UI newline --- README.md | 2 +- docs/cli/nn.rst | 3 +-- requirements_nn.txt | 6 ++---- setup.cfg | 10 ++++++---- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 33240b65..3e4e7d41 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ This is an on-going development so many improvements are still being made. Comme ## Environments - Python: 3.10+ -- CUDA: 11.8, 12.1 (if training neural networks by GPU) +- CUDA: 11.8, 12.6 (if training neural networks by GPU) - Pytorch: 2.0.1+ If you have a different version of CUDA, follow the installation instructions for PyTorch LTS at their [website](https://pytorch.org/). 
diff --git a/docs/cli/nn.rst b/docs/cli/nn.rst
index 7391138d..41eb9a54 100644
--- a/docs/cli/nn.rst
+++ b/docs/cli/nn.rst
@@ -77,8 +77,7 @@ If a model was trained before by this package, the training procedure can start
 
 To use your own word embeddings or vocabulary set, specify the following parameters:
 
-- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`,
-`glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors.
+- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors.
 
 .. code-block::
 
diff --git a/requirements_nn.txt b/requirements_nn.txt
index 2c0da6a4..edd34545 100644
--- a/requirements_nn.txt
+++ b/requirements_nn.txt
@@ -1,8 +1,6 @@
 nltk
 lightning
 # https://github.com/pytorch/text/releases
-torch<=2.3
+torch
 torchmetrics==0.10.3
-torchtext
-# https://github.com/huggingface/transformers/issues/38464
-transformers<=4.51.3
+transformers
diff --git a/setup.cfg b/setup.cfg
index a676ce91..c8a6cb3d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ project_urls =
     Documentation = https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
     Source Code = https://github.com/ASUS-AICS/LibMultiLabel/
 classifiers =
-    Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1
+    Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.6
     Environment :: GPU :: NVIDIA CUDA :: 11.8
     Intended Audience :: Developers
     Intended Audience :: Education
@@ -21,6 +21,9 @@ classifiers =
     Operating System :: OS Independent
     Programming Language :: Python :: 3
     Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
+    Programming Language :: Python :: 3.13
 
 [options]
 packages = find:
@@ -40,10 +43,9 @@ python_requires = >=3.10
 nn =
     lightning
     nltk
-    torch<=2.3
+    torch
     torchmetrics==0.10.3
-    torchtext
-    transformers<=4.51.3
+    transformers
 
 [options.packages.find]

From 2fd112cec830bfd499143cdc1bfef5f369bdbbfe Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Sat, 26 Jul 2025 20:40:59 +0800
Subject: [PATCH 5/9] Fix attentionXML: get UNK like torchtext

---
 libmultilabel/nn/attentionxml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmultilabel/nn/attentionxml.py b/libmultilabel/nn/attentionxml.py
index b54776ac..747f1b05 100644
--- a/libmultilabel/nn/attentionxml.py
+++ b/libmultilabel/nn/attentionxml.py
@@ -489,7 +489,7 @@ def reformat_text(self, dataset):
         # Convert words to numbers according to their indices in word_dict. Then pad each instance to a certain length.
         encoded_text = list(
             map(
-                lambda text: torch.tensor([self.word_dict[word] for word in text], dtype=torch.int64)
+                lambda text: torch.tensor([self.word_dict.get(word, self.word_dict[UNK]) for word in text], dtype=torch.int64)
                 if text
                 else torch.tensor([self.word_dict[UNK]], dtype=torch.int64),
                 [instance["text"][: self.max_seq_length] for instance in dataset],

From 5db5943f8676a5d48775483eb2d8fbf53a8c5137 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Sun, 27 Jul 2025 17:32:17 +0800
Subject: [PATCH 6/9] Update CUDA and Pytorch versions.

---
 README.md                      | 4 ++--
 libmultilabel/nn/data_utils.py | 7 ++++---
 setup.cfg                      | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 3e4e7d41..5504daaa 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@ This is an on-going development so many improvements are still being made. Comme
 ## Environments
 
 - Python: 3.10+
-- CUDA: 11.8, 12.6 (if training neural networks by GPU)
-- Pytorch: 2.0.1+
+- CUDA: 11.8, 12.1, 12.6 (if training neural networks by GPU)
+- Pytorch: 2.3.0+
 
 If you have a different version of CUDA, follow the installation instructions for PyTorch LTS at their [website](https://pytorch.org/).
 
diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index fdd56d09..e14bee06 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -275,7 +275,7 @@ def load_or_build_text_dict(
 ):
     """Build or load the vocabulary from the training dataset or the predefined `vocab_file`.
     The pretrained embedding can be either from a self-defined `embed_file` or from one of
-    the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`.
+    the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, or `glove.840B.300d`.
 
     Args:
         dataset (list): List of training instances with index, label, and tokenized text.
@@ -319,7 +319,8 @@ def load_or_build_text_dict(
 
 
 def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
-    r"""Build word dictionary, modified from torchtext.vocab.build-vocab-from-iterator (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
+    r"""Build word dictionary, modified from `torchtext.vocab.build-vocab-from-iterator`
+    (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
 
     Args:
         vocab_list: List of words.
@@ -338,7 +339,7 @@ def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
     sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
     ordered_dict = OrderedDict(sorted_by_freq_tuples)
 
-    # add special tokens at the beginning
+    # add special tokens at the beginning
     tokens = specials or []
     for token, freq in ordered_dict.items():
         if freq >= min_vocab_freq:
diff --git a/setup.cfg b/setup.cfg
index c8a6cb3d..290089c6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ project_urls =
     Documentation = https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
     Source Code = https://github.com/ASUS-AICS/LibMultiLabel/
 classifiers =
-    Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.6
+    Environment :: GPU :: NVIDIA CUDA :: 12
     Environment :: GPU :: NVIDIA CUDA :: 11.8
     Intended Audience :: Developers
     Intended Audience :: Education

From 948f6274fd0fad7e9d806fe62b5508703904b280 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Tue, 29 Jul 2025 19:00:43 +0800
Subject: [PATCH 7/9] Update data_utils.py: (1) finalize doc strings (2) move
 ".vector_cache" from main to _down..
 for API

---
 docs/cli/nn.rst                |  2 +-
 libmultilabel/nn/data_utils.py | 46 ++++++++++++++++++++--------------
 main.py                        |  3 +--
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/docs/cli/nn.rst b/docs/cli/nn.rst
index 41eb9a54..102231fa 100644
--- a/docs/cli/nn.rst
+++ b/docs/cli/nn.rst
@@ -77,7 +77,7 @@ If a model was trained before by this package, the training procedure can start
 
 To use your own word embeddings or vocabulary set, specify the following parameters:
 
-- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors.
+- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors. Example:
 
 .. code-block::
 
diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index e14bee06..1907ee53 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -23,7 +23,7 @@
 UNK = "<unk>"
 PAD = "<pad>"
 
-PRETRAINED_ALIASES = {
+GLOVE_WORD_EMBEDDING = {
     "glove.42B.300d",
     "glove.840B.300d",
     "glove.6B.50d",
@@ -164,6 +164,7 @@ def _load_raw_data(data, is_test=False, tokenize_text=True, remove_no_label_data
     Args:
         data (Union[str, pandas,.Dataframe]): Training, test, or validation data in file or dataframe.
         is_test (bool, optional): Whether the data is for test or not. Defaults to False.
+        tokenize_text (bool, optional): Whether to tokenize text. Defaults to True.
         remove_no_label_data (bool, optional): Whether to remove training/validation instances that have no labels.
             This is effective only when is_test=False. Defaults to False.
 
@@ -281,7 +282,7 @@ def load_or_build_text_dict(
         dataset (list): List of training instances with index, label, and tokenized text.
         vocab_file (str, optional): Path to a file holding vocabuaries. Defaults to None.
         min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
-        embed_file (str): Path to a file holding pre-trained embeddings.
+        embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding. Defaults to None.
         embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
         silent (bool, optional): Enable silent mode. Defaults to False.
         normalize_embed (bool, optional): Whether the embeddings of each word is normalized to a unit vector. Defaults to False.
@@ -319,13 +320,13 @@ def load_or_build_text_dict(
 
 
 def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
-    r"""Build word dictionary, modified from `torchtext.vocab.build-vocab-from-iterator`
+    r"""Build word dictionary, modified from `torchtext.vocab.build-vocab-from-iterator`
     (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
 
     Args:
         vocab_list: List of words.
         min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
-        specials: Special tokens (e.g., <pad>, <unk>) to add.
+        specials: Special tokens (e.g., <pad>, <unk>) to add. Defaults to None.
 
     Returns:
         dict: A dictionary which maps tokens to indices.
@@ -339,7 +340,7 @@ def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
     sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
     ordered_dict = OrderedDict(sorted_by_freq_tuples)
 
-    # add special tokens at the beginning
+    # add special tokens at the beginning
     tokens = specials or []
     for token, freq in ordered_dict.items():
         if freq >= min_vocab_freq:
@@ -388,12 +389,14 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False):
 
 
 def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None):
-    """If the word exists in the embedding file, load the pretrained word embedding.
-    Otherwise, assign a zero vector to that word.
+    """Obtain the word embeddings from file. If the word exists in the embedding file,
+    load the pretrained word embedding. Otherwise, assign a zero vector to that word.
+    If the given `embed_file` is the name of a pretrained GloVe embedding, the function
+    will first download the corresponding file.
 
     Args:
         word_dict (dict): A dictionary for mapping tokens to indices.
-        embed_file (str): Path to a file holding pre-trained embeddings.
+        embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding.
         silent (bool, optional): Enable silent mode. Defaults to False.
         cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
 
@@ -401,14 +404,14 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
         torch.Tensor: Embedding weights (vocab_size, embed_size).
     """
 
-    if embed_file in PRETRAINED_ALIASES:
-        embed_file = _download_pretrained_embedding(embed_file, cache_dir=cache_dir)
+    if embed_file in GLOVE_WORD_EMBEDDING:
+        embed_file = _download_glove_embedding(embed_file, cache_dir=cache_dir)
     elif not os.path.isfile(embed_file):
         raise ValueError(
-            "Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, PRETRAINED_ALIASES)
+            "Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, GLOVE_WORD_EMBEDDING)
         )
 
-    logging.info(f"Load pretrained embedding from file: {embed_file}.")
+    logging.info(f"Load pretrained embedding from {embed_file}.")
     with open(embed_file) as f:
         word_vectors = f.readlines()
     embed_size = len(word_vectors[0].split()) - 1
@@ -433,25 +436,30 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
             embedding_weights[word_dict[word]] = vector_dict[word]
             vec_counts += 1
 
-    logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings")
+    logging.info(f"Loaded {vec_counts}/{len(word_dict)} word embeddings")
 
     return embedding_weights
 
 
-def _download_pretrained_embedding(embed_file, cache_dir=None):
+def _download_glove_embedding(embed_name, cache_dir=None):
     """Download pretrained glove embedding from https://huggingface.co/stanfordnlp/glove/tree/main.
 
+    Args:
+        embed_name (str): The name of the pretrained GloVe embedding. Defaults to None.
+        cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
+
     Returns:
-        str: Path to the cached or downloaded embedding file.
+        str: Path to the file that contains the cached embeddings.
""" - cached_embed_file = f"{cache_dir}/{embed_file}.txt" + cache_dir = ".vector_cache" if cache_dir is None else cache_dir + cached_embed_file = f"{cache_dir}/{embed_name}.txt" if os.path.isfile(cached_embed_file): return cached_embed_file os.makedirs(cache_dir, exist_ok=True) - remote_embed_file = re.sub(r"6B.*", "6B", embed_file) + ".zip" + remote_embed_file = re.sub(r"6B.*", "6B", embed_name) + ".zip" url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}" - logging.info(f"Downloading pretrained embedding from {url}.") + logging.info(f"Downloading pretrained embeddings from {url}.") try: zip_file, _ = urlretrieve(url, f"{cache_dir}/{remote_embed_file}") with zipfile.ZipFile(zip_file, "r") as zf: @@ -459,5 +467,5 @@ def _download_pretrained_embedding(embed_file, cache_dir=None): except Exception as e: os.remove(zip_file) raise e - + logging.info(f"Downloaded pretrained embeddings {embed_name} to {cached_embed_file}.") return cached_embed_file diff --git a/main.py b/main.py index d87fbdf9..3a1aa98c 100644 --- a/main.py +++ b/main.py @@ -141,7 +141,7 @@ def add_all_arguments(parser): # pretrained vocab / embeddings parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabuaries (default: %(default)s)") parser.add_argument( - "--embed_file", type=str, help="Path to a file holding pre-trained embeddings (default: %(default)s)" + "--embed_file", type=str, help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)" ) parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)") @@ -189,7 +189,6 @@ def add_all_arguments(parser): parser.add_argument( "--embed_cache_dir", type=str, - default=".vector_cache", help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)", ) parser.add_argument( From eb7614e848962e2762e7ba6919fc000ba634c590 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Tue, 29 Jul 2025 19:01:13 +0800 Subject: [PATCH 8/9] bump version: 0.8.1 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 290089c6..778e0958 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = libmultilabel -version = 0.8.0 +version = 0.8.1 author = LibMultiLabel Team license = MIT License license_file = LICENSE From 084b63f66a27d3f9158f0974e29c34a2dd0e41d2 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Tue, 29 Jul 2025 19:22:15 +0800 Subject: [PATCH 9/9] Apply black formatter --- libmultilabel/nn/data_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 1907ee53..950e9669 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -389,9 +389,9 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False): def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None): - """Obtain the word embeddings from file. If the word exists in the embedding file, + """Obtain the word embeddings from file. If the word exists in the embedding file, load the pretrained word embedding. Otherwise, assign a zero vector to that word. - If the given `embed_file` is the name of a pretrained GloVe embedding, the function + If the given `embed_file` is the name of a pretrained GloVe embedding, the function will first download the corresponding file. Args: