 import argparse
+import json
+import logging
 from pathlib import Path
 from typing import Iterable
 
 import numpy as np
 from datasets import load_dataset
 from more_itertools import batched
-from reach import Reach
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
 
 _SAVE_INTERVAL = 10
 _MAX_MEANS = 1100000
 
+logger = logging.getLogger(__name__)
+
+
+def save_data(means: list[np.ndarray], txts: list[str], base_filepath: str) -> None:
+    """
+    Save the means and texts to separate files.
+
+    :param means: List of numpy arrays representing the mean embeddings.
+    :param txts: List of texts corresponding to the embeddings.
+    :param base_filepath: Base path for the output files.
+    """
+    vectors_filepath = base_filepath + "_vectors.npy"
+    items_filepath = base_filepath + "_items.json"
+
+    # Save the embeddings (vectors) to a .npy file
+    np.save(vectors_filepath, np.array(means))
+    # Save the texts to a JSON file
+    with open(items_filepath, "w") as f:
+        json.dump({"items": txts}, f)
+    logger.info(f"Saved {len(txts)} texts to {items_filepath} and vectors to {vectors_filepath}")
+
 
 def featurize(texts: Iterable[str], model: SentenceTransformer, output_dir: str) -> None:
     """
@@ -35,55 +57,76 @@ def featurize(texts: Iterable[str], model: SentenceTransformer, output_dir: str) -> None:
 
     for index, batch in enumerate(tqdm(batched(texts, 32))):
         i = index // _SAVE_INTERVAL
-        if (out_path / f"featurized_{i}.json").exists():
-            continue
-        # Consume the generator
+        base_filename = f"featurized_{i}"
+        vectors_filepath = out_path / (base_filename + "_vectors.npy")
+        items_filepath = out_path / (base_filename + "_items.json")
         list_batch = [x["text"].strip() for x in batch if x.get("text")]
+        if not list_batch:
+            continue  # Skip empty batches
+
+        # Encode the batch to get token embeddings
+        token_embeddings = model.encode(
+            list_batch,
+            output_value="token_embeddings",
+            convert_to_tensor=True,
+        )
 
-        # Already truncated to model max_length
+        # Tokenize the batch to get input IDs
        tokenized_ids = model.tokenize(list_batch)["input_ids"]
-        token_embeddings: list[np.ndarray] = [
-            x.cpu().numpy() for x in model.encode(list_batch, output_value="token_embeddings", convert_to_numpy=True)
-        ]
 
-        for tokenized_id, token_embedding in zip(tokenized_ids, token_embeddings, strict=True):
-            # Truncate to actual length of vectors, remove CLS and SEP.
-            text = model.tokenizer.decode(tokenized_id[1 : len(token_embedding) - 1])
+        for tokenized_id, token_embedding in zip(tokenized_ids, token_embeddings):
+            # Drop the special tokens (CLS and SEP) from the token IDs
+            token_ids = tokenized_id[1:-1]
+            # Decode the remaining token IDs back to text
+            text = model.tokenizer.decode(token_ids)
             if text in seen:
                 continue
             seen.add(text)
+            # Get the corresponding token embeddings (excluding special tokens)
+            token_embeds = token_embedding[1:-1]
+            # Detach, move to the CPU, and convert to a NumPy array
+            token_embeds = token_embeds.detach().cpu().numpy()
+            # Compute the mean of the token embeddings
+            mean = np.mean(token_embeds, axis=0)
             txts.append(text)
             means.append(mean)
             total_means += 1
 
             if total_means >= _MAX_MEANS:
-                # Save the final batch and stop
-                r = Reach(means, txts)
-                r.save(out_path / f"featurized_{(index // _SAVE_INTERVAL)}.json")
+                save_data(means, txts, str(out_path / base_filename))
                 return
 
         if index > 0 and (index + 1) % _SAVE_INTERVAL == 0:
-            r = Reach(means, txts)
-            r.save(out_path / f"featurized_{(index // _SAVE_INTERVAL)}.json")
+            save_data(means, txts, str(out_path / base_filename))
             txts = []
             means = []
             seen = set()
     else:
-        if means:
-            r = Reach(means, txts)
-            r.save(out_path / f"featurized_{(index // _SAVE_INTERVAL)}.json")
+        if txts and means:
+            save_data(means, txts, str(out_path / base_filename))
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Train a Model2Vec using tokenlearn.")
+def main() -> None:
+    """Main function to featurize texts using a sentence transformer."""
+    parser = argparse.ArgumentParser(description="Featurize texts using a sentence transformer.")
     parser.add_argument(
         "--model-name",
         type=str,
         default="baai/bge-base-en-v1.5",
         help="The model name for distillation (e.g., 'baai/bge-base-en-v1.5').",
     )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/c4_bgebase",
+        help="Directory to save the featurized texts.",
+    )
     args = parser.parse_args()
+
     model = SentenceTransformer(args.model_name)
     dataset = load_dataset("allenai/c4", name="en", split="train", streaming=True)
-    featurize(dataset, model, "data/c4_bgebase")
+    featurize(dataset, model, args.output_dir)
+
+
+if __name__ == "__main__":
+    main()