forked from langchain-ai/langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
Add InMemory and HnswLib vector stores #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
anna-charlotte
wants to merge
6
commits into
master
Choose a base branch
from
docarray-vectorstore
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 4 commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
41433e6
feat: add in-memory and hnswlib vectorstore
anna-charlotte b687fd4
refactor: use abstract VecStoreFromDocIndex for in memory and hnswlib …
anna-charlotte de262f9
fix: clean up and add dependencies
anna-charlotte 30456bc
Add more configurations for hnswlib
anna-charlotte 5d2324a
refactor: rename InMemory to InMemoryExactSearch
anna-charlotte ecc73b4
fix: change space default for hnswlib to l2
anna-charlotte File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,141 @@ | ||
| """Wrapper around HnswLib store.""" | ||
| from __future__ import annotations | ||
|
|
||
| from typing import List, Optional, Type | ||
|
|
||
| from langchain.embeddings.base import Embeddings | ||
| from langchain.vectorstores.base import VST | ||
| from langchain.vectorstores.vector_store_from_doc_index import ( | ||
| VecStoreFromDocIndex, | ||
| _check_docarray_import, | ||
| ) | ||
|
|
||
|
|
||
class HnswLib(VecStoreFromDocIndex):
    """Wrapper around HnswLib storage.

    To use it, you should have the ``docarray[hnswlib]`` package with version
    >=0.31.0 installed.
    You can install it with `pip install "langchain[hnswlib]"`.
    """

    def __init__(
        self,
        embedding: Embeddings,
        work_dir: str,
        n_dim: int,
        dist_metric: str = "cosine",
        max_elements: int = 1024,
        index: bool = True,
        ef_construction: int = 200,
        ef: int = 10,
        M: int = 16,
        allow_replace_deleted: bool = True,
        num_threads: int = 1,
    ) -> None:
        """Initialize HnswLib store.

        Args:
            embedding (Embeddings): Embedding function.
            work_dir (str): path to the location where all the data will be stored.
            n_dim (int): dimension of an embedding.
            dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
                "ip", and "l2". Defaults to "cosine".
            max_elements (int): Maximum number of vectors that can be stored.
                Defaults to 1024.
            index (bool): Whether an index should be built for this field.
                Defaults to True.
            ef_construction (int): defines a construction time/accuracy trade-off.
                Defaults to 200.
            ef (int): parameter controlling query time/accuracy trade-off.
                Defaults to 10.
            M (int): parameter that defines the maximum number of outgoing
                connections in the graph. Defaults to 16.
            allow_replace_deleted (bool): Enables replacing of deleted elements
                with new added ones. Defaults to True.
            num_threads (int): Sets the number of cpu threads to use. Defaults to 1.

        Raises:
            ImportError: If ``google.protobuf`` cannot be imported.
        """
        # NOTE(review): presumably raises when docarray is missing or too old;
        # confirm in vector_store_from_doc_index.
        _check_docarray_import()
        from docarray.index import HnswDocumentIndex

        # protobuf is only probed here so that a missing optional dependency
        # fails with an actionable install hint.
        try:
            import google.protobuf  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "Could not import all required packages. "
                "Please install it with `pip install \"langchain[hnswlib]\"`."
            ) from err

        # The HNSW settings are handed to `_get_doc_cls`, whose result
        # parametrizes the document index below.
        doc_cls = self._get_doc_cls(
            {
                "dim": n_dim,
                "space": dist_metric,
                "max_elements": max_elements,
                "index": index,
                "ef_construction": ef_construction,
                "ef": ef,
                "M": M,
                "allow_replace_deleted": allow_replace_deleted,
                "num_threads": num_threads,
            }
        )
        doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir)
        super().__init__(doc_index, embedding)

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        work_dir: Optional[str] = None,
        n_dim: Optional[int] = None,
        dist_metric: str = "cosine",
        max_elements: int = 1024,
        index: bool = True,
        ef_construction: int = 200,
        ef: int = 10,
        M: int = 16,
        allow_replace_deleted: bool = True,
        num_threads: int = 1,
    ) -> HnswLib:
        """Create an HnswLib store and insert data.

        ``work_dir`` and ``n_dim`` default to ``None`` in the signature but are
        required; a ``ValueError`` is raised when either is missing.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
                Defaults to None.
            work_dir (str): path to the location where all the data will be stored.
            n_dim (int): dimension of an embedding.
            dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
                "ip", and "l2". Defaults to "cosine".
            max_elements (int): Maximum number of vectors that can be stored.
                Defaults to 1024.
            index (bool): Whether an index should be built for this field.
                Defaults to True.
            ef_construction (int): defines a construction time/accuracy trade-off.
                Defaults to 200.
            ef (int): parameter controlling query time/accuracy trade-off.
                Defaults to 10.
            M (int): parameter that defines the maximum number of outgoing
                connections in the graph. Defaults to 16.
            allow_replace_deleted (bool): Enables replacing of deleted elements
                with new added ones. Defaults to True.
            num_threads (int): Sets the number of cpu threads to use. Defaults to 1.

        Returns:
            HnswLib Vector Store

        Raises:
            ValueError: If ``work_dir`` or ``n_dim`` is None.
        """
        if work_dir is None:
            raise ValueError("`work_dir` parameter has not been set.")
        if n_dim is None:
            raise ValueError("`n_dim` parameter has not been set.")

        # Forward every HNSW setting so that non-default values supplied by the
        # caller actually reach the constructor.
        store = cls(
            embedding=embedding,
            work_dir=work_dir,
            n_dim=n_dim,
            dist_metric=dist_metric,
            max_elements=max_elements,
            index=index,
            ef_construction=ef_construction,
            ef=ef,
            M=M,
            allow_replace_deleted=allow_replace_deleted,
            num_threads=num_threads,
        )
        store.add_texts(texts=texts, metadatas=metadatas)
        return store
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| """Wrapper around in-memory storage.""" | ||
| from __future__ import annotations | ||
|
|
||
| from typing import List, Optional, Type | ||
|
|
||
| from langchain.embeddings.base import Embeddings | ||
| from langchain.vectorstores.base import VST | ||
| from langchain.vectorstores.vector_store_from_doc_index import ( | ||
| VecStoreFromDocIndex, | ||
| _check_docarray_import, | ||
| ) | ||
|
|
||
|
|
||
class InMemory(VecStoreFromDocIndex):
    """Vector store built on docarray's ``InMemoryExactNNIndex``.

    Requires the ``docarray`` package with version >=0.31.0.
    Install it with `pip install "langchain[in_memory_store]"`.
    """

    def __init__(
        self,
        embedding: Embeddings,
        metric: str = "cosine_sim",
    ) -> None:
        """Set up an empty in-memory store.

        Args:
            embedding (Embeddings): Embedding function.
            metric (str): metric used for exact nearest-neighbor search.
                One of "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
                Defaults to "cosine_sim".
        """
        _check_docarray_import()
        from docarray.index import InMemoryExactNNIndex

        # The metric is handed to `_get_doc_cls` under the "space" key; the
        # returned class parametrizes the index.
        schema = self._get_doc_cls({"space": metric})
        super().__init__(InMemoryExactNNIndex[schema](), embedding)

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        metric: str = "cosine_sim",
    ) -> InMemory:
        """Build an in-memory store and add the given texts to it.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
                Defaults to None.
            metric (str): metric used for exact nearest-neighbor search.
                One of "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
                Defaults to "cosine_sim".

        Returns:
            InMemory Vector Store
        """
        instance = cls(metric=metric, embedding=embedding)
        instance.add_texts(metadatas=metadatas, texts=texts)
        return instance
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.