diff --git a/README.md b/README.md index 33240b65..5504daaa 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ This is an on-going development so many improvements are still being made. Comme ## Environments - Python: 3.10+ -- CUDA: 11.8, 12.1 (if training neural networks by GPU) -- Pytorch: 2.0.1+ +- CUDA: 11.8, 12.1, 12.6 (if training neural networks by GPU) +- Pytorch: 2.3.0+ If you have a different version of CUDA, follow the installation instructions for PyTorch LTS at their [website](https://pytorch.org/). diff --git a/docs/cli/nn.rst b/docs/cli/nn.rst index efe2f5f2..102231fa 100644 --- a/docs/cli/nn.rst +++ b/docs/cli/nn.rst @@ -77,7 +77,7 @@ If a model was trained before by this package, the training procedure can start To use your own word embeddings or vocabulary set, specify the following parameters: -- **embed_file**: choose one of the pretrained embeddings defined in `torchtext `_ or specify the path to your word embeddings with each line containing a word followed by its vectors. Example: +- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors. Example: .. code-block:: diff --git a/docs/examples/plot_KimCNN_quickstart.py b/docs/examples/plot_KimCNN_quickstart.py index 39efd6ba..49ae1f0d 100644 --- a/docs/examples/plot_KimCNN_quickstart.py +++ b/docs/examples/plot_KimCNN_quickstart.py @@ -32,7 +32,7 @@ # To run KimCNN, LibMultiLabel tokenizes documents and uses an embedding vector for each word. # Thus, ``tokenize_text=True`` is set. # -# We choose ``glove.6B.300d`` from torchtext as embedding vectors. +# We choose ``glove.6B.300d`` as embedding vectors. datasets = load_datasets("data/rcv1/train.txt", "data/rcv1/test.txt", tokenize_text=True) classes = load_or_build_label(datasets) diff --git a/libmultilabel/nn/attentionxml.py b/libmultilabel/nn/attentionxml.py index b54776ac..747f1b05 100644 --- a/libmultilabel/nn/attentionxml.py +++ b/libmultilabel/nn/attentionxml.py @@ -489,7 +489,7 @@ def reformat_text(self, dataset): # Convert words to numbers according to their indices in word_dict. Then pad each instance to a certain length. 
         encoded_text = list(
             map(
-                lambda text: torch.tensor([self.word_dict[word] for word in text], dtype=torch.int64)
+                lambda text: torch.tensor([self.word_dict.get(word, self.word_dict[UNK]) for word in text], dtype=torch.int64)
                 if text
                 else torch.tensor([self.word_dict[UNK]], dtype=torch.int64),
                 [instance["text"][: self.max_seq_length] for instance in dataset],
diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index 1d48bb06..950e9669 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -1,7 +1,12 @@
 import csv
 import gc
 import logging
+import os
+import re
 import warnings
+import zipfile
+from urllib.request import urlretrieve
+from collections import Counter, OrderedDict

 import pandas as pd
 import torch
@@ -11,7 +16,6 @@
 from sklearn.preprocessing import MultiLabelBinarizer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import Dataset
-from torchtext.vocab import build_vocab_from_iterator, pretrained_aliases, Vocab
 from tqdm import tqdm

 transformers.logging.set_verbosity_error()
@@ -19,6 +23,14 @@

 UNK = "<unk>"
 PAD = "<pad>"
+GLOVE_WORD_EMBEDDING = {
+    "glove.42B.300d",
+    "glove.840B.300d",
+    "glove.6B.50d",
+    "glove.6B.100d",
+    "glove.6B.200d",
+    "glove.6B.300d",
+}


 class TextDataset(Dataset):
@@ -31,8 +43,7 @@ class TextDataset(Dataset):
         add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
         tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
             the transformer-based pretrained language model. Defaults to None.
-        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
-            map tokens to indices. Defaults to None.
+        word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
     """

     def __init__(
@@ -55,7 +66,7 @@ def __init__(
         self.num_classes = len(self.classes)
         self.label_binarizer = MultiLabelBinarizer().fit([classes])

-        if not isinstance(self.word_dict, Vocab) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
+        if not isinstance(self.word_dict, dict) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
             raise ValueError("Please specify exactly one of word_dict or tokenizer")

     def __len__(self):
@@ -71,7 +82,7 @@ def __getitem__(self, index):
             else:
                 input_ids = self.tokenizer.encode(data["text"], add_special_tokens=False)
         else:
-            input_ids = [self.word_dict[word] for word in data["text"]]
+            input_ids = [self.word_dict.get(word, self.word_dict[UNK]) for word in data["text"]]
         return {
             "text": torch.LongTensor(input_ids[: self.max_seq_length]),
             "label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]),
@@ -128,8 +139,7 @@ def get_dataset_loader(
         add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
         tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
             the transformer-based pretrained language model. Defaults to None.
-        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
-            map tokens to indices. Defaults to None.
+        word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.

     Returns:
         torch.utils.data.DataLoader: A pytorch DataLoader.
@@ -154,6 +164,7 @@ def _load_raw_data(data, is_test=False, tokenize_text=True, remove_no_label_data
     Args:
         data (Union[str, pandas,.Dataframe]): Training, test, or validation data in file or dataframe.
         is_test (bool, optional): Whether the data is for test or not. Defaults to False.
+ tokenize_text (bool, optional): Whether to tokenize text. Defaults to True. remove_no_label_data (bool, optional): Whether to remove training/validation instances that have no labels. This is effective only when is_test=False. Defaults to False. @@ -265,20 +276,19 @@ def load_or_build_text_dict( ): """Build or load the vocabulary from the training dataset or the predefined `vocab_file`. The pretrained embedding can be either from a self-defined `embed_file` or from one of - the vectors defined in torchtext.vocab.pretrained_aliases - (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py). + the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, or `glove.840B.300d`. Args: dataset (list): List of training instances with index, label, and tokenized text. vocab_file (str, optional): Path to a file holding vocabuaries. Defaults to None. min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1. - embed_file (str): Path to a file holding pre-trained embeddings. + embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding. Defaults to None. embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None. silent (bool, optional): Enable silent mode. Defaults to False. normalize_embed (bool, optional): Whether the embeddings of each word is normalized to a unit vector. Defaults to False. Returns: - tuple[torchtext.vocab.Vocab, torch.Tensor]: A vocab object which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim). + tuple[dict, torch.Tensor]: A dictionary which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim). """ if vocab_file: logging.info(f"Load vocab from {vocab_file}") @@ -286,14 +296,14 @@ def load_or_build_text_dict( vocab_list = [[vocab.strip() for vocab in fp.readlines()]] # Keep PAD index 0 to align `padding_idx` of # class Embedding in libmultilabel.nn.networks.modules. - vocabs = build_vocab_from_iterator(vocab_list, min_freq=1, specials=[PAD, UNK]) + word_dict = _build_word_dict(vocab_list, min_vocab_freq=1, specials=[PAD, UNK]) else: vocab_list = [set(data["text"]) for data in dataset] - vocabs = build_vocab_from_iterator(vocab_list, min_freq=min_vocab_freq, specials=[PAD, UNK]) - vocabs.set_default_index(vocabs[UNK]) - logging.info(f"Read {len(vocabs)} vocabularies.") + word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK]) + + logging.info(f"Read {len(word_dict)} vocabularies.") - embedding_weights = get_embedding_weights_from_file(vocabs, embed_file, silent, embed_cache_dir) + embedding_weights = get_embedding_weights_from_file(word_dict, embed_file, silent, embed_cache_dir) if normalize_embed: # To have better precision for calculating the normalization, we convert the original @@ -306,7 +316,41 @@ def load_or_build_text_dict( embedding_weights[i] = vector / float(torch.linalg.norm(vector) + 1e-6) embedding_weights = embedding_weights.float() - return vocabs, embedding_weights + return word_dict, embedding_weights + + +def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None): + r"""Build word dictionary, modified from `torchtext.vocab.build-vocab-from-iterator` + (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator) + + Args: + vocab_list: List of words. 
+        min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
+        specials: Special tokens (e.g., <pad>, <unk>) to add. Defaults to None.
+
+    Returns:
+        dict: A dictionary which maps tokens to indices.
+    """
+
+    counter = Counter()
+    for tokens in vocab_list:
+        counter.update(tokens)
+
+    # sort by descending frequency, then lexicographically
+    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
+    ordered_dict = OrderedDict(sorted_by_freq_tuples)
+
+    # add special tokens at the beginning
+    tokens = specials or []
+    for token, freq in ordered_dict.items():
+        if freq >= min_vocab_freq:
+            tokens.append(token)
+
+    # build token to indices dict
+    word_dict = dict()
+    for idx, token in enumerate(tokens):
+        word_dict[token] = idx
+    return word_dict


 def load_or_build_label(datasets, label_file=None, include_test_labels=False):
@@ -344,70 +388,84 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False):
     return classes


-def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=None):
-    """If the word exists in the embedding file, load the pretrained word embedding.
-    Otherwise, assign a zero vector to that word.
+def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None):
+    """Obtain the word embeddings from file. If the word exists in the embedding file,
+    load the pretrained word embedding. Otherwise, assign a zero vector to that word.
+    If the given `embed_file` is the name of a pretrained GloVe embedding, the function
+    will first download the corresponding file.

     Args:
-        word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
-        embed_file (str): Path to a file holding pre-trained embeddings.
+        word_dict (dict): A dictionary for mapping tokens to indices.
+        embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding.
         silent (bool, optional): Enable silent mode. Defaults to False.
-        cache (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
+        cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.

     Returns:
         torch.Tensor: Embedding weights (vocab_size, embed_size).
     """
-    # Load pretrained word embedding
-    load_embedding_from_file = embed_file not in pretrained_aliases
-    if load_embedding_from_file:
-        logging.info(f"Load pretrained embedding from file: {embed_file}.")
-        with open(embed_file) as f:
-            word_vectors = f.readlines()
-        embed_size = len(word_vectors[0].split()) - 1
-        vector_dict = {}
-        for word_vector in tqdm(word_vectors, disable=silent):
-            word, vector = word_vector.rstrip().split(" ", 1)
-            vector = torch.Tensor(list(map(float, vector.split())))
-            vector_dict[word] = vector
-    else:
-        logging.info(f"Load pretrained embedding from torchtext.")
-        # Adapted from https://pytorch.org/text/0.9.0/_modules/torchtext/vocab.html#Vocab.load_vectors.
- if embed_file not in pretrained_aliases: - raise ValueError( - "Got embed_file {}, but allowed pretrained " - "vectors are {}".format(embed_file, list(pretrained_aliases.keys())) - ) - - # Hotfix: Glove URLs are outdated in Torchtext - # (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py#L213-L217) - pretrained_cls = pretrained_aliases[embed_file] - if embed_file.startswith("glove"): - for name, url in pretrained_cls.func.url.items(): - file_name = url.split("/")[-1] - pretrained_cls.func.url[name] = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{file_name}" - - vector_dict = pretrained_cls(cache=cache) - embed_size = vector_dict.dim - embedding_weights = torch.zeros(len(word_dict), embed_size) + if embed_file in GLOVE_WORD_EMBEDDING: + embed_file = _download_glove_embedding(embed_file, cache_dir=cache_dir) + elif not os.path.isfile(embed_file): + raise ValueError( + "Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, GLOVE_WORD_EMBEDDING) + ) + + logging.info(f"Load pretrained embedding from {embed_file}.") + with open(embed_file) as f: + word_vectors = f.readlines() + embed_size = len(word_vectors[0].split()) - 1 - if load_embedding_from_file: - # Add UNK embedding - # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) - # CAML: np.random.randn(embed_size) - unk_vector = torch.randn(embed_size) - embedding_weights[word_dict[UNK]] = unk_vector + vector_dict = {} + for word_vector in tqdm(word_vectors, disable=silent): + word, vector = word_vector.rstrip().split(" ", 1) + vector = torch.Tensor(list(map(float, vector.split()))) + vector_dict[word] = vector + + embedding_weights = torch.zeros(len(word_dict), embed_size) + # Add UNK embedding + # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size) + # CAML: np.random.randn(embed_size) + unk_vector = torch.randn(embed_size) + embedding_weights[word_dict[UNK]] = unk_vector # Store pretrained word embedding vec_counts = 0 - for word in word_dict.get_itos(): - # The condition can be used to process the word that does not in the embedding file. - # Note that torchtext vector object has already dealt with this, - # so we can directly make a query without addtional handling. - if (load_embedding_from_file and word in vector_dict) or not load_embedding_from_file: + for word in word_dict.keys(): + if word in vector_dict: embedding_weights[word_dict[word]] = vector_dict[word] vec_counts += 1 - logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings") + logging.info(f"Loaded {vec_counts}/{len(word_dict)} word embeddings") return embedding_weights + + +def _download_glove_embedding(embed_name, cache_dir=None): + """Download pretrained glove embedding from https://huggingface.co/stanfordnlp/glove/tree/main. + + Args: + embed_name (str): The name of the pretrained GloVe embedding. Defaults to None. + cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None. + + Returns: + str: Path to the file that contains the cached embeddings. 
+ """ + cache_dir = ".vector_cache" if cache_dir is None else cache_dir + cached_embed_file = f"{cache_dir}/{embed_name}.txt" + if os.path.isfile(cached_embed_file): + return cached_embed_file + os.makedirs(cache_dir, exist_ok=True) + + remote_embed_file = re.sub(r"6B.*", "6B", embed_name) + ".zip" + url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}" + logging.info(f"Downloading pretrained embeddings from {url}.") + try: + zip_file, _ = urlretrieve(url, f"{cache_dir}/{remote_embed_file}") + with zipfile.ZipFile(zip_file, "r") as zf: + zf.extractall(cache_dir) + except Exception as e: + os.remove(zip_file) + raise e + logging.info(f"Downloaded pretrained embeddings {embed_name} to {cached_embed_file}.") + return cached_embed_file diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py index a1dcf070..f7f76439 100644 --- a/libmultilabel/nn/model.py +++ b/libmultilabel/nn/model.py @@ -181,7 +181,7 @@ class Model(MultiLabelModel): Args: classes (list): List of class names. - word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices. + word_dict (dict): A dictionary for mapping tokens to indices. network (nn.Module): Network (i.e., CAML, KimCNN, or XMLCNN). loss_function (str, optional): Loss function name (i.e., binary_cross_entropy_with_logits, cross_entropy). Defaults to 'binary_cross_entropy_with_logits'. diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py index a4ac82c2..f9107d01 100644 --- a/libmultilabel/nn/nn_utils.py +++ b/libmultilabel/nn/nn_utils.py @@ -61,8 +61,7 @@ def init_model( model_name (str): Model to be used such as KimCNN. network_config (dict): Configuration for defining the network. classes (list): List of class names. - word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to - map tokens to indices. Defaults to None. + word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None. embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape (vocab_size, embed_dim). Defaults to None. init_weight (str): Weight initialization method from `torch.nn.init`. diff --git a/main.py b/main.py index 12564f6b..3a1aa98c 100644 --- a/main.py +++ b/main.py @@ -141,7 +141,7 @@ def add_all_arguments(parser): # pretrained vocab / embeddings parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabuaries (default: %(default)s)") parser.add_argument( - "--embed_file", type=str, help="Path to a file holding pre-trained embeddings (default: %(default)s)" + "--embed_file", type=str, help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)" ) parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)") diff --git a/requirements_nn.txt b/requirements_nn.txt index 2c0da6a4..edd34545 100644 --- a/requirements_nn.txt +++ b/requirements_nn.txt @@ -1,8 +1,6 @@ nltk lightning # https://github.com/pytorch/text/releases -torch<=2.3 +torch torchmetrics==0.10.3 -torchtext -# https://github.com/huggingface/transformers/issues/38464 -transformers<=4.51.3 +transformers diff --git a/search_params.py b/search_params.py index aad38ece..c4d8b9db 100644 --- a/search_params.py +++ b/search_params.py @@ -25,8 +25,8 @@ def train_libmultilabel_tune(config, datasets, classes, word_dict): Args: config (dict): Config of the experiment. datasets (dict): A dictionary of datasets. - classes(list): List of class names. 
- word_dict(torchtext.vocab.Vocab): A vocab object which maps tokens to indices. + classes (list): List of class names. + word_dict (dict): A dictionary for mapping tokens to indices. """ # ray convert AttributeDict to dict diff --git a/setup.cfg b/setup.cfg index a676ce91..778e0958 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = libmultilabel -version = 0.8.0 +version = 0.8.1 author = LibMultiLabel Team license = MIT License license_file = LICENSE @@ -12,7 +12,7 @@ project_urls = Documentation = https://www.csie.ntu.edu.tw/~cjlin/libmultilabel Source Code = https://github.com/ASUS-AICS/LibMultiLabel/ classifiers = - Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1 + Environment :: GPU :: NVIDIA CUDA :: 12 Environment :: GPU :: NVIDIA CUDA :: 11.8 Intended Audience :: Developers Intended Audience :: Education @@ -21,6 +21,9 @@ classifiers = Operating System :: OS Independent Programming Language :: Python :: 3 Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 [options] packages = find: @@ -40,10 +43,9 @@ python_requires = >=3.10 nn = lightning nltk - torch<=2.3 + torch torchmetrics==0.10.3 - torchtext - transformers<=4.51.3 + transformers [options.packages.find] diff --git a/tests/nn/components.py b/tests/nn/components.py index b74a1c15..bcfbcd68 100644 --- a/tests/nn/components.py +++ b/tests/nn/components.py @@ -20,7 +20,7 @@ def get_name(self): return "token_to_id" def get_from_trainer(self, trainer): - return trainer.model.word_dict.get_stoi() + return trainer.model.word_dict def compare(self, a, b): return a == b diff --git a/torch_trainer.py b/torch_trainer.py index 8dc259b5..a7f0641d 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -19,7 +19,7 @@ class TorchTrainer: config (AttributeDict): Config of the experiment. datasets (dict, optional): Datasets for training, validation, and test. Defaults to None. classes(list, optional): List of class names. - word_dict(torchtext.vocab.Vocab, optional): A vocab object which maps tokens to indices. + word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None. embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape (vocab_size, embed_dim). save_checkpoints (bool, optional): Whether to save the last and the best checkpoint or not. Defaults to True. @@ -136,7 +136,7 @@ def _setup_model( Args: classes(list): List of class names. - word_dict(torchtext.vocab.Vocab): A vocab object which maps tokens to indices. + word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None. embed_vecs (torch.Tensor): The pre-trained word vectors of shape (vocab_size, embed_dim). log_path (str): Path to the log file. The log file contains the validation results for each epoch and the test results. If the `log_path` is None, no performance
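
For reference, the dict-based vocabulary that replaces `torchtext.vocab.Vocab` in this patch can be exercised on its own. The sketch below is illustrative rather than part of the patch: the `build_word_dict` helper and the toy corpus are invented for the example, but the logic mirrors the new `_build_word_dict` in `libmultilabel/nn/data_utils.py` and the `word_dict.get(word, word_dict[UNK])` lookup now used by `TextDataset` and `attentionxml.py`:

    from collections import Counter, OrderedDict

    PAD, UNK = "<pad>", "<unk>"

    def build_word_dict(vocab_list, min_vocab_freq=1, specials=(PAD, UNK)):
        # Count tokens, sort by descending frequency then lexicographically,
        # place the special tokens first, and assign consecutive indices.
        counter = Counter()
        for tokens in vocab_list:
            counter.update(tokens)
        ordered = OrderedDict(sorted(counter.items(), key=lambda x: (-x[1], x[0])))
        kept = list(specials) + [t for t, f in ordered.items() if f >= min_vocab_freq]
        return {token: idx for idx, token in enumerate(kept)}

    word_dict = build_word_dict([{"deep", "learning"}, {"deep", "text"}])
    assert word_dict[PAD] == 0  # PAD stays at index 0, aligning with padding_idx in the networks
    ids = [word_dict.get(w, word_dict[UNK]) for w in ["deep", "unseen"]]  # OOV words fall back to <unk>

    # With the patched API (hedged, following the docstrings in data_utils.py), the same kind of
    # dictionary plus GloVe weights would come from something like:
    #   word_dict, embed_vecs = load_or_build_text_dict(dataset=datasets["train"], embed_file="glove.6B.300d")

Passing one of the names in GLOVE_WORD_EMBEDDING as `embed_file` triggers the new `_download_glove_embedding` helper, which downloads the vectors from the Hugging Face `stanfordnlp/glove` repository and caches them under `.vector_cache/` by default.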