diff --git a/chromadb/test/ef/test_ef.py b/chromadb/test/ef/test_ef.py
index c93502e3fc8..da10b33e5b2 100644
--- a/chromadb/test/ef/test_ef.py
+++ b/chromadb/test/ef/test_ef.py
@@ -30,6 +30,7 @@ def test_get_builtins_holds() -> None:
         "SentenceTransformerEmbeddingFunction",
         "Text2VecEmbeddingFunction",
         "ChromaLangchainEmbeddingFunction",
+        "FastEmbedEmbeddingFunction",
     }
     assert expected_builtins == embedding_functions.get_builtins()
diff --git a/chromadb/test/ef/test_fastembed_ef.py b/chromadb/test/ef/test_fastembed_ef.py
new file mode 100644
index 00000000000..989d240390a
--- /dev/null
+++ b/chromadb/test/ef/test_fastembed_ef.py
@@ -0,0 +1,15 @@
+import pytest
+
+from chromadb.utils.embedding_functions.fastembed_embedding_function import (
+    FastEmbedEmbeddingFunction,
+)
+
+# Skip the test if the 'fastembed' package is not installed
+fastembed = pytest.importorskip("fastembed", reason="fastembed not installed")
+
+
+def test_fastembed() -> None:
+    ef = FastEmbedEmbeddingFunction(model_name="BAAI/bge-small-en-v1.5")
+    embeddings = ef(["Here is an article about llamas...", "this is another article"])
+    assert len(embeddings) == 2
+    assert len(embeddings[0]) == 384
diff --git a/chromadb/utils/embedding_functions/fastembed_embedding_function.py b/chromadb/utils/embedding_functions/fastembed_embedding_function.py
new file mode 100644
index 00000000000..af6bac84689
--- /dev/null
+++ b/chromadb/utils/embedding_functions/fastembed_embedding_function.py
@@ -0,0 +1,73 @@
+from typing import Any, Optional, cast
+
+from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
+
+
+class FastEmbedEmbeddingFunction(EmbeddingFunction[Documents]):
+    """
+    Generates embeddings for a list of texts using FastEmbed - https://qdrant.github.io/fastembed/.
+    Find the list of supported models at https://qdrant.github.io/fastembed/examples/Supported_Models/.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "BAAI/bge-small-en-v1.5",
+        batch_size: int = 256,
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        parallel: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize fastembed.TextEmbedding.
+
+        Args:
+            model_name (str): The name of the model to use. Defaults to `"BAAI/bge-small-en-v1.5"`.
+            batch_size (int): Batch size for encoding. Higher values use more memory but are faster.\
+                Defaults to 256.
+            cache_dir (str, optional): The path to the model cache directory.\
+                Can also be set using the `FASTEMBED_CACHE_PATH` env variable.
+            threads (int, optional): The number of threads a single onnxruntime session can use.
+            parallel (int, optional): If `>1`, data-parallel encoding will be used; recommended for offline encoding of large datasets.\
+                If `0`, use all available cores.\
+                If `None`, don't use data-parallel processing; use the default onnxruntime threading instead.\
+                Defaults to None.
+            **kwargs: Additional options to pass to fastembed.TextEmbedding.
+
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model>, e.g. BAAI/bge-base-en.
+        """
+        try:
+            from fastembed import TextEmbedding
+        except ImportError:
+            raise ValueError(
+                "The 'fastembed' package is not installed. Please install it with `pip install fastembed`"
+            )
+        self._batch_size = batch_size
+        self._parallel = parallel
+        self._model = TextEmbedding(
+            model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs
+        )
+
+    def __call__(self, input: Documents) -> Embeddings:
+        """
+        Get the embeddings for a list of texts.
+
+        Args:
+            input (Documents): A list of texts to get embeddings for.
+
+        Returns:
+            Embeddings: The embeddings for the texts.
+
+        Example:
+            >>> fastembed_ef = FastEmbedEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
+            >>> texts = ["Hello, world!", "How are you?"]
+            >>> embeddings = fastembed_ef(texts)
+        """
+        embeddings = self._model.embed(
+            input, batch_size=self._batch_size, parallel=self._parallel
+        )
+        return cast(
+            Embeddings,
+            [embedding.tolist() for embedding in embeddings],
+        )
diff --git a/docs/docs.trychroma.com/pages/guides/embeddings.md b/docs/docs.trychroma.com/pages/guides/embeddings.md
index d523c7d3089..31a35a15f05 100644
--- a/docs/docs.trychroma.com/pages/guides/embeddings.md
+++ b/docs/docs.trychroma.com/pages/guides/embeddings.md
@@ -18,6 +18,7 @@ Chroma provides lightweight wrappers around popular embedding providers, making
 | [Instructor](/integrations/instructor) | ✅ | ➖ |
 | [Hugging Face Embedding Server](/integrations/hugging-face-server) | ✅ | ✅ |
 | [Jina AI](/integrations/jinaai) | ✅ | ✅ |
+| [FastEmbed](/integrations/fastembed) | ✅ | ➖ |
 
 We welcome pull requests to add new Embedding Functions to the community.
 
diff --git a/docs/docs.trychroma.com/pages/integrations/fastembed.md b/docs/docs.trychroma.com/pages/integrations/fastembed.md
new file mode 100644
index 00000000000..6fec044a955
--- /dev/null
+++ b/docs/docs.trychroma.com/pages/integrations/fastembed.md
@@ -0,0 +1,29 @@
+---
+title: FastEmbed
+---
+
+# FastEmbed
+
+[FastEmbed](https://qdrant.github.io/fastembed/) is a lightweight, CPU-first Python library built for embedding generation.
+
+This embedding function requires the `fastembed` package. To install it, run:
+
+```pip install fastembed```
+
+You can find a list of all the supported models [here](https://qdrant.github.io/fastembed/examples/Supported_Models/).
+
+## Example usage
+
+Using the default `BAAI/bge-small-en-v1.5` model:
+
+```python
+from chromadb.utils.embedding_functions.fastembed_embedding_function import FastEmbedEmbeddingFunction
+ef = FastEmbedEmbeddingFunction()
+```
+
+You can also configure the cache directory, the number of threads, and other FastEmbed options:
+
+```python
+from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction
+ef = FastEmbedEmbeddingFunction(model_name="nomic-ai/nomic-embed-text-v1.5", cache_dir="models_cache", threads=5)
+```
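
For reviewers, here is a minimal end-to-end sketch of how the new `FastEmbedEmbeddingFunction` plugs into a Chroma collection. The collection name, ids, and documents are illustrative placeholders, and it assumes `fastembed` is installed:

```python
import chromadb

from chromadb.utils.embedding_functions.fastembed_embedding_function import (
    FastEmbedEmbeddingFunction,
)

# Default model is BAAI/bge-small-en-v1.5, which produces 384-dimensional
# vectors (matching the assertion in the new test).
ef = FastEmbedEmbeddingFunction()

client = chromadb.Client()
collection = client.create_collection(name="fastembed-demo", embedding_function=ef)

# Documents are embedded locally on CPU via FastEmbed when added.
collection.add(
    ids=["1", "2"],
    documents=["Here is an article about llamas...", "This is another article"],
)

# Query texts are embedded with the same function before the search.
results = collection.query(query_texts=["llamas"], n_results=1)
print(results["documents"])
```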