From bffa43f9b9c08a506beece00c525a7c48bd217d6 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Tue, 22 Jul 2025 16:55:39 +0800
Subject: [PATCH 1/9] Draft first runnable version: remove torchtext from
 nn/data_utils.py

---
 libmultilabel/nn/data_utils.py | 176 +++++++++++++++++++++------------
 main.py                        |   1 +
 2 files changed, 115 insertions(+), 62 deletions(-)

diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index 1d48bb06..b6e104f9 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -1,7 +1,12 @@
 import csv
 import gc
 import logging
+import os
+import re
 import warnings
+import zipfile
+from urllib.request import urlretrieve
+from collections import Counter, OrderedDict
 
 import pandas as pd
 import torch
@@ -11,7 +16,6 @@
 from sklearn.preprocessing import MultiLabelBinarizer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import Dataset
-from torchtext.vocab import build_vocab_from_iterator, pretrained_aliases, Vocab
 from tqdm import tqdm
 
 transformers.logging.set_verbosity_error()
@@ -19,6 +23,14 @@
 UNK = "<unk>"
 PAD = "<pad>"
 
+PRETRAINED_ALIASES = {
+    "glove.42B.300d",
+    "glove.840B.300d",
+    "glove.6B.50d",
+    "glove.6B.100d",
+    "glove.6B.200d",
+    "glove.6B.300d",
+}
 
 
 class TextDataset(Dataset):
@@ -31,8 +43,7 @@ class TextDataset(Dataset):
         add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
         tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
             the transformer-based pretrained language model. Defaults to None.
-        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
-            map tokens to indices. Defaults to None.
+        word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
     """
 
     def __init__(
@@ -55,7 +66,7 @@ def __init__(
         self.num_classes = len(self.classes)
         self.label_binarizer = MultiLabelBinarizer().fit([classes])
 
-        if not isinstance(self.word_dict, Vocab) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
+        if not isinstance(self.word_dict, dict) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
             raise ValueError("Please specify exactly one of word_dict or tokenizer")
 
     def __len__(self):
@@ -71,7 +82,7 @@ def __getitem__(self, index):
             else:
                 input_ids = self.tokenizer.encode(data["text"], add_special_tokens=False)
         else:
-            input_ids = [self.word_dict[word] for word in data["text"]]
+            input_ids = [self.word_dict.get(word, self.word_dict[UNK]) for word in data["text"]]
         return {
             "text": torch.LongTensor(input_ids[: self.max_seq_length]),
             "label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]),
         }
@@ -128,8 +139,7 @@ def get_dataset_loader(
         add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
         tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
             the transformer-based pretrained language model. Defaults to None.
-        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
-            map tokens to indices. Defaults to None.
+        word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
 
     Returns:
         torch.utils.data.DataLoader: A pytorch DataLoader.
@@ -276,9 +286,9 @@ def load_or_build_text_dict(
         embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
         silent (bool, optional): Enable silent mode. Defaults to False.
         normalize_embed (bool, optional): Whether the embeddings of each word is normalized to a unit vector. Defaults to False.
-
+
     Returns:
-        tuple[torchtext.vocab.Vocab, torch.Tensor]: A vocab object which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
+        tuple[dict, torch.Tensor]: A dictionary which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
     """
     if vocab_file:
         logging.info(f"Load vocab from {vocab_file}")
@@ -286,14 +296,14 @@
             vocab_list = [[vocab.strip() for vocab in fp.readlines()]]
         # Keep PAD index 0 to align `padding_idx` of
         # class Embedding in libmultilabel.nn.networks.modules.
-        vocabs = build_vocab_from_iterator(vocab_list, min_freq=1, specials=[PAD, UNK])
+        word_dict = _build_word_dict(vocab_list, min_vocab_freq=1, specials=[PAD, UNK])
     else:
         vocab_list = [set(data["text"]) for data in dataset]
-        vocabs = build_vocab_from_iterator(vocab_list, min_freq=min_vocab_freq, specials=[PAD, UNK])
-    vocabs.set_default_index(vocabs[UNK])
-    logging.info(f"Read {len(vocabs)} vocabularies.")
+        word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK])  # we don't need min_vocab_freq as we use set
+
+    logging.info(f"Read {len(word_dict)} vocabularies.")  # TBD: check if pad unk is included
 
-    embedding_weights = get_embedding_weights_from_file(vocabs, embed_file, silent, embed_cache_dir)
+    embedding_weights = get_embedding_weights_from_file(word_dict, embed_file, silent, embed_cache_dir)
 
     if normalize_embed:
         # To have better precision for calculating the normalization, we convert the original
@@ -306,7 +316,40 @@
             embedding_weights[i] = vector / float(torch.linalg.norm(vector) + 1e-6)
         embedding_weights = embedding_weights.float()
 
-    return vocabs, embedding_weights
+    return word_dict, embedding_weights
+
+
+def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
+    r"""Build word dictionary, modified from torchtext.vocab.build-vocab-from-iterator (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
+
+    Args:
+        vocab_list: List of words.
+        min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
+        specials: Special tokens (e.g., <pad>, <unk>) to add.
+
+    Returns:
+        dict: A dictionary which maps tokens to indices.
+    """
+
+    counter = Counter()
+    for tokens in vocab_list:
+        counter.update(tokens)
+
+    # sort by descending frequency, then lexicographically
+    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
+    ordered_dict = OrderedDict(sorted_by_freq_tuples)
+
+    # add special tokens at the beginning
+    tokens = specials or []
+    for token, freq in ordered_dict.items():
+        if freq >= min_vocab_freq:
+            tokens.append(token)
+
+    # build token to indices dict
+    word_dict = dict()
+    for idx, token in enumerate(tokens):
+        word_dict[token] = idx
+    return word_dict
 
 
 def load_or_build_label(datasets, label_file=None, include_test_labels=False):
@@ -344,70 +387,79 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False):
 
     return classes
 
 
-def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=None):
+def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None):
     """If the word exists in the embedding file, load the pretrained word embedding.
     Otherwise, assign a zero vector to that word.
 
     Args:
-        word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
+        word_dict (dict): A vocab object which maps tokens to indices.
         embed_file (str): Path to a file holding pre-trained embeddings.
         silent (bool, optional): Enable silent mode. Defaults to False.
-        cache (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
+        cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
 
     Returns:
        torch.Tensor: Embedding weights (vocab_size, embed_size).
     """
-    # Load pretrained word embedding
-    load_embedding_from_file = embed_file not in pretrained_aliases
-    if load_embedding_from_file:
-        logging.info(f"Load pretrained embedding from file: {embed_file}.")
-        with open(embed_file) as f:
-            word_vectors = f.readlines()
-        embed_size = len(word_vectors[0].split()) - 1
-        vector_dict = {}
-        for word_vector in tqdm(word_vectors, disable=silent):
-            word, vector = word_vector.rstrip().split(" ", 1)
-            vector = torch.Tensor(list(map(float, vector.split())))
-            vector_dict[word] = vector
-    else:
-        logging.info(f"Load pretrained embedding from torchtext.")
-        # Adapted from https://pytorch.org/text/0.9.0/_modules/torchtext/vocab.html#Vocab.load_vectors.
-        if embed_file not in pretrained_aliases:
-            raise ValueError(
-                "Got embed_file {}, but allowed pretrained "
-                "vectors are {}".format(embed_file, list(pretrained_aliases.keys()))
-            )
-
-        # Hotfix: Glove URLs are outdated in Torchtext
-        # (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py#L213-L217)
-        pretrained_cls = pretrained_aliases[embed_file]
-        if embed_file.startswith("glove"):
-            for name, url in pretrained_cls.func.url.items():
-                file_name = url.split("/")[-1]
-                pretrained_cls.func.url[name] = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{file_name}"
-
-        vector_dict = pretrained_cls(cache=cache)
-        embed_size = vector_dict.dim
-    embedding_weights = torch.zeros(len(word_dict), embed_size)
+    if embed_file in PRETRAINED_ALIASES:
+        embed_file = _download_pretrained_embedding(embed_file, cache_dir=cache_dir)
+    elif not os.path.isfile(embed_file):
+        raise ValueError(
+            "Got embed_file {}, but allowed pretrained "
+            "embeddings are {}".format(embed_file, PRETRAINED_ALIASES)
+        )
+
+    logging.info(f"Load pretrained embedding from file: {embed_file}.")
+    with open(embed_file) as f:
+        word_vectors = f.readlines()
+    embed_size = len(word_vectors[0].split()) - 1
+
+    vector_dict = {}
+    for word_vector in tqdm(word_vectors, disable=silent):
+        word, vector = word_vector.rstrip().split(" ", 1)
+        vector = torch.Tensor(list(map(float, vector.split())))
+        vector_dict[word] = vector
 
-    if load_embedding_from_file:
-        # Add UNK embedding
-        # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size)
-        # CAML: np.random.randn(embed_size)
-        unk_vector = torch.randn(embed_size)
-        embedding_weights[word_dict[UNK]] = unk_vector
+    embedding_weights = torch.zeros(len(word_dict), embed_size)
+    # Add UNK embedding
+    # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size)
+    # CAML: np.random.randn(embed_size)
+    unk_vector = torch.randn(embed_size)
+    embedding_weights[word_dict[UNK]] = unk_vector
 
     # Store pretrained word embedding
     vec_counts = 0
-    for word in word_dict.get_itos():
-        # The condition can be used to process the word that does not in the embedding file.
-        # Note that torchtext vector object has already dealt with this,
-        # so we can directly make a query without addtional handling.
-        if (load_embedding_from_file and word in vector_dict) or not load_embedding_from_file:
+    # for word in word_dict.get_itos(): # list of words
+    for word in word_dict.keys():
+        if word in vector_dict:
             embedding_weights[word_dict[word]] = vector_dict[word]
-            vec_counts += 1
+        vec_counts += 1
 
     logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings")
 
     return embedding_weights
+
+
+def _download_pretrained_embedding(embed_file, cache_dir=None):
+    """Download pretrained glove embedding from https://huggingface.co/stanfordnlp/glove/tree/main.
+
+    Returns:
+        str: Path to the cached or downloaded embedding file.
+    """
+    cached_embed_file = f"{cache_dir}/{embed_file}.txt"
+    if os.path.isfile(cached_embed_file):
+        return cached_embed_file
+    os.makedirs(cache_dir, exist_ok=True)
+
+    remote_embed_file = re.sub(r"6B.*", "6B", embed_file) + ".zip"
+    url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}"
+    logging.info(f"Downloading pretrained embedding from {url}.")
+    try:
+        zip_file, _ = urlretrieve(url, f"{cache_dir}/{remote_embed_file}")
+        with zipfile.ZipFile(zip_file, "r") as zf:
+            zf.extractall(cache_dir)
+    except Exception as e:
+        os.remove(zip_file)
+        raise e
+
+    return cached_embed_file
diff --git a/main.py b/main.py
index 12564f6b..d87fbdf9 100644
--- a/main.py
+++ b/main.py
@@ -189,6 +189,7 @@ def add_all_arguments(parser):
     parser.add_argument(
         "--embed_cache_dir",
         type=str,
+        default=".vector_cache",
         help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)",
     )
     parser.add_argument(

From bb2261db5b5aab21eb8439fa121f57df57dc0258 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Thu, 24 Jul 2025 09:27:34 +0800
Subject: [PATCH 2/9] Update comments related to torchtext.

---
 docs/cli/nn.rst                         |  3 ++-
 docs/examples/plot_KimCNN_quickstart.py |  2 +-
 libmultilabel/nn/data_utils.py          | 23 ++++++++++-------------
 libmultilabel/nn/model.py               |  2 +-
 libmultilabel/nn/nn_utils.py            |  3 +--
 search_params.py                        |  4 ++--
 tests/nn/components.py                  |  2 +-
 torch_trainer.py                        |  4 ++--
 8 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/docs/cli/nn.rst b/docs/cli/nn.rst
index efe2f5f2..7391138d 100644
--- a/docs/cli/nn.rst
+++ b/docs/cli/nn.rst
@@ -77,7 +77,8 @@ If a model was trained before by this package, the training procedure can start
 
 To use your own word embeddings or vocabulary set, specify the following parameters:
 
-- **embed_file**: choose one of the pretrained embeddings defined in `torchtext `_ or specify the path to your word embeddings with each line containing a word followed by its vectors. Example:
+- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`,
+`glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors.
 
 .. code-block::
 
diff --git a/docs/examples/plot_KimCNN_quickstart.py b/docs/examples/plot_KimCNN_quickstart.py
index 39efd6ba..49ae1f0d 100644
--- a/docs/examples/plot_KimCNN_quickstart.py
+++ b/docs/examples/plot_KimCNN_quickstart.py
@@ -32,7 +32,7 @@
 # To run KimCNN, LibMultiLabel tokenizes documents and uses an embedding vector for each word.
 # Thus, ``tokenize_text=True`` is set.
 #
-# We choose ``glove.6B.300d`` from torchtext as embedding vectors.
+# We choose ``glove.6B.300d`` as embedding vectors.
 
 datasets = load_datasets("data/rcv1/train.txt", "data/rcv1/test.txt", tokenize_text=True)
 classes = load_or_build_label(datasets)
diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index b6e104f9..6d9c3f95 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -275,8 +275,7 @@ def load_or_build_text_dict(
 ):
     """Build or load the vocabulary from the training dataset or the predefined `vocab_file`.
     The pretrained embedding can be either from a self-defined `embed_file` or from one of
-    the vectors defined in torchtext.vocab.pretrained_aliases
-    (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py).
+    the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`.
 
     Args:
         dataset (list): List of training instances with index, label, and tokenized text.
@@ -286,7 +285,7 @@ def load_or_build_text_dict(
         embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
         silent (bool, optional): Enable silent mode. Defaults to False.
         normalize_embed (bool, optional): Whether the embeddings of each word is normalized to a unit vector. Defaults to False.
-
+
     Returns:
         tuple[dict, torch.Tensor]: A dictionary which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
     """
@@ -299,9 +298,9 @@ def load_or_build_text_dict(
         word_dict = _build_word_dict(vocab_list, min_vocab_freq=1, specials=[PAD, UNK])
     else:
         vocab_list = [set(data["text"]) for data in dataset]
-        word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK])  # we don't need min_vocab_freq as we use set
+        word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK])
 
-    logging.info(f"Read {len(word_dict)} vocabularies.")  # TBD: check if pad unk is included
+    logging.info(f"Read {len(word_dict)} vocabularies.")
 
     embedding_weights = get_embedding_weights_from_file(word_dict, embed_file, silent, embed_cache_dir)
 
@@ -334,7 +333,7 @@ def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
     counter = Counter()
     for tokens in vocab_list:
         counter.update(tokens)
-
+
     # sort by descending frequency, then lexicographically
     sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
     ordered_dict = OrderedDict(sorted_by_freq_tuples)
@@ -392,7 +391,7 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
     Otherwise, assign a zero vector to that word.
 
     Args:
-        word_dict (dict): A vocab object which maps tokens to indices.
+        word_dict (dict): A dictionary for mapping tokens to indices.
         embed_file (str): Path to a file holding pre-trained embeddings.
         silent (bool, optional): Enable silent mode. Defaults to False.
         cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
@@ -405,15 +404,14 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
         embed_file = _download_pretrained_embedding(embed_file, cache_dir=cache_dir)
     elif not os.path.isfile(embed_file):
         raise ValueError(
-            "Got embed_file {}, but allowed pretrained "
-            "embeddings are {}".format(embed_file, PRETRAINED_ALIASES)
+            "Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, PRETRAINED_ALIASES)
         )
 
     logging.info(f"Load pretrained embedding from file: {embed_file}.")
     with open(embed_file) as f:
         word_vectors = f.readlines()
     embed_size = len(word_vectors[0].split()) - 1
-
+
     vector_dict = {}
     for word_vector in tqdm(word_vectors, disable=silent):
         word, vector = word_vector.rstrip().split(" ", 1)
@@ -429,7 +427,6 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
 
     # Store pretrained word embedding
     vec_counts = 0
-    # for word in word_dict.get_itos(): # list of words
     for word in word_dict.keys():
         if word in vector_dict:
             embedding_weights[word_dict[word]] = vector_dict[word]
@@ -448,9 +445,9 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
     """
     cached_embed_file = f"{cache_dir}/{embed_file}.txt"
     if os.path.isfile(cached_embed_file):
-        return cached_embed_file
+        return cached_embed_file
     os.makedirs(cache_dir, exist_ok=True)
-
+
     remote_embed_file = re.sub(r"6B.*", "6B", embed_file) + ".zip"
     url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}"
     logging.info(f"Downloading pretrained embedding from {url}.")
diff --git a/libmultilabel/nn/model.py b/libmultilabel/nn/model.py
index a1dcf070..f7f76439 100644
--- a/libmultilabel/nn/model.py
+++ b/libmultilabel/nn/model.py
@@ -181,7 +181,7 @@ class Model(MultiLabelModel):
 
     Args:
         classes (list): List of class names.
-        word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
+        word_dict (dict): A dictionary for mapping tokens to indices.
         network (nn.Module): Network (i.e., CAML, KimCNN, or XMLCNN).
         loss_function (str, optional): Loss function name (i.e., binary_cross_entropy_with_logits,
             cross_entropy). Defaults to 'binary_cross_entropy_with_logits'.
diff --git a/libmultilabel/nn/nn_utils.py b/libmultilabel/nn/nn_utils.py
index a4ac82c2..f9107d01 100644
--- a/libmultilabel/nn/nn_utils.py
+++ b/libmultilabel/nn/nn_utils.py
@@ -61,8 +61,7 @@ def init_model(
         model_name (str): Model to be used such as KimCNN.
         network_config (dict): Configuration for defining the network.
         classes (list): List of class names.
-        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
-            map tokens to indices. Defaults to None.
+        word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
         embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape
             (vocab_size, embed_dim). Defaults to None.
         init_weight (str): Weight initialization method from `torch.nn.init`.
diff --git a/search_params.py b/search_params.py
index aad38ece..c4d8b9db 100644
--- a/search_params.py
+++ b/search_params.py
@@ -25,8 +25,8 @@ def train_libmultilabel_tune(config, datasets, classes, word_dict):
     Args:
         config (dict): Config of the experiment.
         datasets (dict): A dictionary of datasets.
-        classes(list): List of class names.
-        word_dict(torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
+        classes (list): List of class names.
+        word_dict (dict): A dictionary for mapping tokens to indices.
""" # ray convert AttributeDict to dict diff --git a/tests/nn/components.py b/tests/nn/components.py index b74a1c15..bcfbcd68 100644 --- a/tests/nn/components.py +++ b/tests/nn/components.py @@ -20,7 +20,7 @@ def get_name(self): return "token_to_id" def get_from_trainer(self, trainer): - return trainer.model.word_dict.get_stoi() + return trainer.model.word_dict def compare(self, a, b): return a == b diff --git a/torch_trainer.py b/torch_trainer.py index 8dc259b5..a7f0641d 100644 --- a/torch_trainer.py +++ b/torch_trainer.py @@ -19,7 +19,7 @@ class TorchTrainer: config (AttributeDict): Config of the experiment. datasets (dict, optional): Datasets for training, validation, and test. Defaults to None. classes(list, optional): List of class names. - word_dict(torchtext.vocab.Vocab, optional): A vocab object which maps tokens to indices. + word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None. embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape (vocab_size, embed_dim). save_checkpoints (bool, optional): Whether to save the last and the best checkpoint or not. Defaults to True. @@ -136,7 +136,7 @@ def _setup_model( Args: classes(list): List of class names. - word_dict(torchtext.vocab.Vocab): A vocab object which maps tokens to indices. + word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None. embed_vecs (torch.Tensor): The pre-trained word vectors of shape (vocab_size, embed_dim). log_path (str): Path to the log file. The log file contains the validation results for each epoch and the test results. If the `log_path` is None, no performance From e19d93435b6fe0b56ab780663533c38a40ee789d Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Thu, 24 Jul 2025 10:57:59 +0800 Subject: [PATCH 3/9] restore indent --- libmultilabel/nn/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 6d9c3f95..fdd56d09 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -430,7 +430,7 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d for word in word_dict.keys(): if word in vector_dict: embedding_weights[word_dict[word]] = vector_dict[word] - vec_counts += 1 + vec_counts += 1 logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings") From 39bdde8e16a5fdbd2ae88714a1836e04b6c59a8a Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Fri, 25 Jul 2025 17:28:45 +0800 Subject: [PATCH 4/9] (1) Update versions: torch, transformers (2) fix UI newline --- README.md | 2 +- docs/cli/nn.rst | 3 +-- requirements_nn.txt | 6 ++---- setup.cfg | 10 ++++++---- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 33240b65..3e4e7d41 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ This is an on-going development so many improvements are still being made. Comme ## Environments - Python: 3.10+ -- CUDA: 11.8, 12.1 (if training neural networks by GPU) +- CUDA: 11.8, 12.6 (if training neural networks by GPU) - Pytorch: 2.0.1+ If you have a different version of CUDA, follow the installation instructions for PyTorch LTS at their [website](https://pytorch.org/). 
diff --git a/docs/cli/nn.rst b/docs/cli/nn.rst
index 7391138d..41eb9a54 100644
--- a/docs/cli/nn.rst
+++ b/docs/cli/nn.rst
@@ -77,8 +77,7 @@ If a model was trained before by this package, the training procedure can start
 
 To use your own word embeddings or vocabulary set, specify the following parameters:
 
-- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`,
-`glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors.
+- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors.
 
 .. code-block::
 
diff --git a/requirements_nn.txt b/requirements_nn.txt
index 2c0da6a4..edd34545 100644
--- a/requirements_nn.txt
+++ b/requirements_nn.txt
@@ -1,8 +1,6 @@
 nltk
 lightning
 # https://github.com/pytorch/text/releases
-torch<=2.3
+torch
 torchmetrics==0.10.3
-torchtext
-# https://github.com/huggingface/transformers/issues/38464
-transformers<=4.51.3
+transformers
diff --git a/setup.cfg b/setup.cfg
index a676ce91..c8a6cb3d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ project_urls =
     Documentation = https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
     Source Code = https://github.com/ASUS-AICS/LibMultiLabel/
 classifiers =
-    Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1
+    Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.6
     Environment :: GPU :: NVIDIA CUDA :: 11.8
     Intended Audience :: Developers
     Intended Audience :: Education
@@ -21,6 +21,9 @@ classifiers =
     Operating System :: OS Independent
     Programming Language :: Python :: 3
     Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
+    Programming Language :: Python :: 3.13
 
 [options]
 packages = find:
@@ -40,10 +43,9 @@ python_requires = >=3.10
 nn =
     lightning
     nltk
-    torch<=2.3
+    torch
     torchmetrics==0.10.3
-    torchtext
-    transformers<=4.51.3
+    transformers
 
 [options.packages.find]

From 2fd112cec830bfd499143cdc1bfef5f369bdbbfe Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Sat, 26 Jul 2025 20:40:59 +0800
Subject: [PATCH 5/9] Fix attentionXML: get UNK like torchtext

---
 libmultilabel/nn/attentionxml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmultilabel/nn/attentionxml.py b/libmultilabel/nn/attentionxml.py
index b54776ac..747f1b05 100644
--- a/libmultilabel/nn/attentionxml.py
+++ b/libmultilabel/nn/attentionxml.py
@@ -489,7 +489,7 @@ def reformat_text(self, dataset):
         # Convert words to numbers according to their indices in word_dict. Then pad each instance to a certain length.
         encoded_text = list(
             map(
-                lambda text: torch.tensor([self.word_dict[word] for word in text], dtype=torch.int64)
+                lambda text: torch.tensor([self.word_dict.get(word, self.word_dict[UNK]) for word in text], dtype=torch.int64)
                 if text
                 else torch.tensor([self.word_dict[UNK]], dtype=torch.int64),
                 [instance["text"][: self.max_seq_length] for instance in dataset],

From 5db5943f8676a5d48775483eb2d8fbf53a8c5137 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Sun, 27 Jul 2025 17:32:17 +0800
Subject: [PATCH 6/9] Update CUDA and Pytorch versions.

---
 README.md                      | 4 ++--
 libmultilabel/nn/data_utils.py | 7 ++++---
 setup.cfg                      | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 3e4e7d41..5504daaa 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@ This is an on-going development so many improvements are still being made. Comme
 ## Environments
 
 - Python: 3.10+
-- CUDA: 11.8, 12.6 (if training neural networks by GPU)
-- Pytorch: 2.0.1+
+- CUDA: 11.8, 12.1, 12.6 (if training neural networks by GPU)
+- Pytorch: 2.3.0+
 
 If you have a different version of CUDA, follow the installation instructions for PyTorch LTS at their [website](https://pytorch.org/).
 
diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index fdd56d09..e14bee06 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -275,7 +275,7 @@ def load_or_build_text_dict(
 ):
     """Build or load the vocabulary from the training dataset or the predefined `vocab_file`.
     The pretrained embedding can be either from a self-defined `embed_file` or from one of
-    the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`.
+    the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, or `glove.840B.300d`.
 
     Args:
         dataset (list): List of training instances with index, label, and tokenized text.
@@ -319,7 +319,8 @@ def load_or_build_text_dict(
 
 
 def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
-    r"""Build word dictionary, modified from torchtext.vocab.build-vocab-from-iterator (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
+    r"""Build word dictionary, modified from `torchtext.vocab.build-vocab-from-iterator`
+    (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
 
     Args:
         vocab_list: List of words.
@@ -338,7 +339,7 @@ def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
     sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
     ordered_dict = OrderedDict(sorted_by_freq_tuples)
 
-    # add special tokens at the beginning
+    # add special tokens at the beginning
     tokens = specials or []
     for token, freq in ordered_dict.items():
         if freq >= min_vocab_freq:
diff --git a/setup.cfg b/setup.cfg
index c8a6cb3d..290089c6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ project_urls =
     Documentation = https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
     Source Code = https://github.com/ASUS-AICS/LibMultiLabel/
 classifiers =
-    Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.6
+    Environment :: GPU :: NVIDIA CUDA :: 12
     Environment :: GPU :: NVIDIA CUDA :: 11.8
     Intended Audience :: Developers
     Intended Audience :: Education

From 948f6274fd0fad7e9d806fe62b5508703904b280 Mon Sep 17 00:00:00 2001
From: Eleven Liu
Date: Tue, 29 Jul 2025 19:00:43 +0800
Subject: [PATCH 7/9] Update data_utils.py: (1) finalize doc strings (2) move
 ".vector_cache" from main to _down..
 for API

---
 docs/cli/nn.rst                |  2 +-
 libmultilabel/nn/data_utils.py | 46 ++++++++++++++++++++--------------
 main.py                        |  3 +--
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/docs/cli/nn.rst b/docs/cli/nn.rst
index 41eb9a54..102231fa 100644
--- a/docs/cli/nn.rst
+++ b/docs/cli/nn.rst
@@ -77,7 +77,7 @@ If a model was trained before by this package, the training procedure can start
 
 To use your own word embeddings or vocabulary set, specify the following parameters:
 
-- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors.
+- **embed_file**: choose one of the pretrained embeddings: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, `glove.840B.300d`, or specify the path to your word embeddings with each line containing a word followed by its vectors. Example:
 
 .. code-block::
 
diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py
index e14bee06..1907ee53 100644
--- a/libmultilabel/nn/data_utils.py
+++ b/libmultilabel/nn/data_utils.py
@@ -23,7 +23,7 @@
 UNK = "<unk>"
 PAD = "<pad>"
 
-PRETRAINED_ALIASES = {
+GLOVE_WORD_EMBEDDING = {
     "glove.42B.300d",
     "glove.840B.300d",
     "glove.6B.50d",
@@ -164,6 +164,7 @@ def _load_raw_data(data, is_test=False, tokenize_text=True, remove_no_label_data
     Args:
         data (Union[str, pandas,.Dataframe]): Training, test, or validation data in file or dataframe.
         is_test (bool, optional): Whether the data is for test or not. Defaults to False.
+        tokenize_text (bool, optional): Whether to tokenize text. Defaults to True.
         remove_no_label_data (bool, optional): Whether to remove training/validation instances that have no labels.
             This is effective only when is_test=False. Defaults to False.
 
@@ -281,7 +282,7 @@ def load_or_build_text_dict(
         dataset (list): List of training instances with index, label, and tokenized text.
         vocab_file (str, optional): Path to a file holding vocabuaries. Defaults to None.
         min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
-        embed_file (str): Path to a file holding pre-trained embeddings.
+        embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding. Defaults to None.
         embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
         silent (bool, optional): Enable silent mode. Defaults to False.
         normalize_embed (bool, optional): Whether the embeddings of each word is normalized to a unit vector. Defaults to False.
@@ -319,13 +320,13 @@ def load_or_build_text_dict(
 
 
 def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
-    r"""Build word dictionary, modified from `torchtext.vocab.build-vocab-from-iterator`
+    r"""Build word dictionary, modified from `torchtext.vocab.build-vocab-from-iterator`
     (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
 
     Args:
         vocab_list: List of words.
         min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
-        specials: Special tokens (e.g., <pad>, <unk>) to add.
+        specials: Special tokens (e.g., <pad>, <unk>) to add. Defaults to None.
 
     Returns:
         dict: A dictionary which maps tokens to indices.
@@ -339,7 +340,7 @@ def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
     sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
     ordered_dict = OrderedDict(sorted_by_freq_tuples)
 
-    # add special tokens at the beginning
+    # add special tokens at the beginning
     tokens = specials or []
     for token, freq in ordered_dict.items():
         if freq >= min_vocab_freq:
@@ -388,12 +389,14 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False):
 
 
 def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None):
-    """If the word exists in the embedding file, load the pretrained word embedding.
-    Otherwise, assign a zero vector to that word.
+    """Obtain the word embeddings from file. If the word exists in the embedding file,
+    load the pretrained word embedding. Otherwise, assign a zero vector to that word.
+    If the given `embed_file` is the name of a pretrained GloVe embedding, the function
+    will first download the corresponding file.
 
     Args:
         word_dict (dict): A dictionary for mapping tokens to indices.
-        embed_file (str): Path to a file holding pre-trained embeddings.
+        embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding.
         silent (bool, optional): Enable silent mode. Defaults to False.
         cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
 
@@ -401,14 +404,14 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
         torch.Tensor: Embedding weights (vocab_size, embed_size).
     """
 
-    if embed_file in PRETRAINED_ALIASES:
-        embed_file = _download_pretrained_embedding(embed_file, cache_dir=cache_dir)
+    if embed_file in GLOVE_WORD_EMBEDDING:
+        embed_file = _download_glove_embedding(embed_file, cache_dir=cache_dir)
     elif not os.path.isfile(embed_file):
         raise ValueError(
-            "Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, PRETRAINED_ALIASES)
+            "Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, GLOVE_WORD_EMBEDDING)
         )
 
-    logging.info(f"Load pretrained embedding from file: {embed_file}.")
+    logging.info(f"Load pretrained embedding from {embed_file}.")
     with open(embed_file) as f:
         word_vectors = f.readlines()
     embed_size = len(word_vectors[0].split()) - 1
@@ -433,25 +436,30 @@ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_d
             embedding_weights[word_dict[word]] = vector_dict[word]
             vec_counts += 1
 
-    logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings")
+    logging.info(f"Loaded {vec_counts}/{len(word_dict)} word embeddings")
 
     return embedding_weights
 
 
-def _download_pretrained_embedding(embed_file, cache_dir=None):
+def _download_glove_embedding(embed_name, cache_dir=None):
     """Download pretrained glove embedding from https://huggingface.co/stanfordnlp/glove/tree/main.
 
+    Args:
+        embed_name (str): The name of the pretrained GloVe embedding. Defaults to None.
+        cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
+
     Returns:
-        str: Path to the cached or downloaded embedding file.
+        str: Path to the file that contains the cached embeddings.
""" - cached_embed_file = f"{cache_dir}/{embed_file}.txt" + cache_dir = ".vector_cache" if cache_dir is None else cache_dir + cached_embed_file = f"{cache_dir}/{embed_name}.txt" if os.path.isfile(cached_embed_file): return cached_embed_file os.makedirs(cache_dir, exist_ok=True) - remote_embed_file = re.sub(r"6B.*", "6B", embed_file) + ".zip" + remote_embed_file = re.sub(r"6B.*", "6B", embed_name) + ".zip" url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}" - logging.info(f"Downloading pretrained embedding from {url}.") + logging.info(f"Downloading pretrained embeddings from {url}.") try: zip_file, _ = urlretrieve(url, f"{cache_dir}/{remote_embed_file}") with zipfile.ZipFile(zip_file, "r") as zf: @@ -459,5 +467,5 @@ def _download_pretrained_embedding(embed_file, cache_dir=None): except Exception as e: os.remove(zip_file) raise e - + logging.info(f"Downloaded pretrained embeddings {embed_name} to {cached_embed_file}.") return cached_embed_file diff --git a/main.py b/main.py index d87fbdf9..3a1aa98c 100644 --- a/main.py +++ b/main.py @@ -141,7 +141,7 @@ def add_all_arguments(parser): # pretrained vocab / embeddings parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabuaries (default: %(default)s)") parser.add_argument( - "--embed_file", type=str, help="Path to a file holding pre-trained embeddings (default: %(default)s)" + "--embed_file", type=str, help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)" ) parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)") @@ -189,7 +189,6 @@ def add_all_arguments(parser): parser.add_argument( "--embed_cache_dir", type=str, - default=".vector_cache", help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)", ) parser.add_argument( From eb7614e848962e2762e7ba6919fc000ba634c590 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Tue, 29 Jul 2025 19:01:13 +0800 Subject: [PATCH 8/9] bump version: 0.8.1 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 290089c6..778e0958 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = libmultilabel -version = 0.8.0 +version = 0.8.1 author = LibMultiLabel Team license = MIT License license_file = LICENSE From 084b63f66a27d3f9158f0974e29c34a2dd0e41d2 Mon Sep 17 00:00:00 2001 From: Eleven Liu Date: Tue, 29 Jul 2025 19:22:15 +0800 Subject: [PATCH 9/9] Apply black formatter --- libmultilabel/nn/data_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index 1907ee53..950e9669 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -389,9 +389,9 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False): def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None): - """Obtain the word embeddings from file. If the word exists in the embedding file, + """Obtain the word embeddings from file. If the word exists in the embedding file, load the pretrained word embedding. Otherwise, assign a zero vector to that word. - If the given `embed_file` is the name of a pretrained GloVe embedding, the function + If the given `embed_file` is the name of a pretrained GloVe embedding, the function will first download the corresponding file. Args: