1 | | -"""Wrapper around in-memory DocArray store.""" |
| 1 | +"""Wrapper around in-memory storage.""" |
2 | 2 | from __future__ import annotations |
3 | 3 |
4 | | -from typing import List, Optional, Any, Type |
5 | | - |
6 | | -from docarray.typing import NdArray |
| 4 | +from typing import List, Optional, Type |
7 | 5 |
8 | 6 | from langchain.embeddings.base import Embeddings |
9 | | -from langchain.schema import Document |
10 | 7 | from langchain.vectorstores.base import VST |
11 | | -from langchain.vectorstores.utils import maximal_marginal_relevance |
12 | | -from langchain.vectorstores.vector_store_from_doc_index import _check_docarray_import, VecStoreFromDocIndex |
| 8 | +from langchain.vectorstores.vector_store_from_doc_index import ( |
| 9 | + VecStoreFromDocIndex, |
| 10 | + _check_docarray_import, |
| 11 | +) |
13 | 12 |
14 | 13 |
15 | 14 | class InMemory(VecStoreFromDocIndex): |
16 | 15 | """Wrapper around in-memory storage. |
17 | 16 |
18 | 17 | To use it, you should have the ``docarray`` package with version >=0.31.0 installed. |
| 18 | + You can install it with `pip install "langchain[in_memory_store]"`. |
19 | 19 | """ |
| 20 | + |
20 | 21 | def __init__( |
21 | 22 | self, |
22 | | - texts: List[str], |
23 | 23 | embedding: Embeddings, |
24 | | - metadatas: Optional[List[dict]] = None, |
25 | | - metric: str = 'cosine_sim', |
| 24 | + metric: str = "cosine_sim", |
26 | 25 | ) -> None: |
27 | 26 | """Initialize in-memory store. |
28 | 27 |
29 | 28 | Args: |
30 | | - texts (List[str]): Text data. |
31 | 29 | embedding (Embeddings): Embedding function. |
32 | | - metadatas (Optional[List[dict]]): Metadata for each text if it exists. |
33 | | - Defaults to None. |
34 | 30 | metric (str): metric for exact nearest-neighbor search. |
35 | | - Can be one of: 'cosine_sim', 'euclidean_dist' and 'sqeuclidean_dist'. |
36 | | - Defaults to 'cosine_sim'. |
37 | | -
| 31 | + Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". |
| 32 | + Defaults to "cosine_sim". |
38 | 33 | """ |
39 | 34 | _check_docarray_import() |
40 | | - from docarray.index import InMemoryDocIndex |
41 | | - |
42 | | - doc_cls = self._get_doc_cls(metric) |
43 | | - doc_index = InMemoryDocIndex[doc_cls]() |
44 | | - super().__init__(doc_index, texts, embedding, metadatas) |
45 | | - |
46 | | - @staticmethod |
47 | | - def _get_doc_cls(sim_metric: str): |
48 | | - from docarray import BaseDoc |
49 | | - from pydantic import Field |
50 | | - |
51 | | - class DocArrayDoc(BaseDoc): |
52 | | - text: Optional[str] |
53 | | - embedding: Optional[NdArray] = Field(space=sim_metric) |
54 | | - metadata: Optional[dict] |
| 35 | + from docarray.index import InMemoryExactNNIndex |
55 | 36 |
56 | | - return DocArrayDoc |
| 37 | + doc_cls = self._get_doc_cls({"space": metric}) |
| 38 | + doc_index = InMemoryExactNNIndex[doc_cls]() |
| 39 | + super().__init__(doc_index, embedding) |
57 | 40 |
58 | 41 | @classmethod |
59 | 42 | def from_texts( |
60 | 43 | cls: Type[VST], |
61 | 44 | texts: List[str], |
62 | 45 | embedding: Embeddings, |
63 | 46 | metadatas: Optional[List[dict]] = None, |
64 | | - metric: str = 'cosine_sim', |
65 | | - **kwargs: Any |
| 47 | + metric: str = "cosine_sim", |
66 | 48 | ) -> InMemory: |
67 | | - return cls( |
68 | | - texts=texts, |
69 | | - embedding=embedding, |
70 | | - metadatas=metadatas, |
71 | | - metric=metric, |
72 | | - ) |
73 | | - # |
74 | | - # def add_texts( |
75 | | - # self, |
76 | | - # texts: Iterable[str], |
77 | | - # metadatas: Optional[List[dict]] = None, |
78 | | - # **kwargs: Any |
79 | | - # ) -> List[str]: |
80 | | - # """Run more texts through the embeddings and add to the vectorstore. |
81 | | - # |
82 | | - # Args: |
83 | | - # texts: Iterable of strings to add to the vectorstore. |
84 | | - # metadatas: Optional list of metadatas associated with the texts. |
85 | | - # |
86 | | - # Returns: |
87 | | - # List of ids from adding the texts into the vectorstore. |
88 | | - # """ |
89 | | - # if metadatas is None: |
90 | | - # metadatas = [{} for _ in range(len(list(texts)))] |
91 | | - # |
92 | | - # ids = [] |
93 | | - # embeddings = self.embedding.embed_documents(texts) |
94 | | - # for t, m, e in zip(texts, metadatas, embeddings): |
95 | | - # doc = self.doc_cls( |
96 | | - # text=t, |
97 | | - # embedding=e, |
98 | | - # metadata=m |
99 | | - # ) |
100 | | - # self.docs.append(doc) |
101 | | - # ids.append(doc.id) # TODO return index of self.docs ? |
102 | | - # |
103 | | - # return ids |
104 | | - # |
105 | | - # def similarity_search_with_score( |
106 | | - # self, query: str, k: int = 4, **kwargs: Any |
107 | | - # ) -> List[Tuple[Document, float]]: |
108 | | - # """Return docs most similar to query. |
109 | | - # |
110 | | - # Args: |
111 | | - # query: Text to look up documents similar to. |
112 | | - # k: Number of Documents to return. Defaults to 4. |
113 | | - # |
114 | | - # Returns: |
115 | | - # List of Documents most similar to the query and score for each. |
116 | | - # """ |
117 | | - # from docarray.utils.find import find # TODO move import |
118 | | - # |
119 | | - # query_embedding = self.embedding.embed_query(query) |
120 | | - # query_doc = self.doc_cls(embedding=query_embedding) |
121 | | - # docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding') |
122 | | - # |
123 | | - # result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] |
124 | | - # return result |
125 | | - # |
126 | | - # def similarity_search( |
127 | | - # self, query: str, k: int = 4, **kwargs: Any |
128 | | - # ) -> List[Document]: |
129 | | - # """Return docs most similar to query. |
130 | | - # |
131 | | - # Args: |
132 | | - # query: Text to look up documents similar to. |
133 | | - # k: Number of Documents to return. Defaults to 4. |
134 | | - # |
135 | | - # Returns: |
136 | | - # List of Documents most similar to the query. |
137 | | - # """ |
138 | | - # results = self.similarity_search_with_score(query, k) |
139 | | - # return list(map(itemgetter(0), results)) |
140 | | - # |
141 | | - # def _similarity_search_with_relevance_scores( |
142 | | - # self, |
143 | | - # query: str, |
144 | | - # k: int = 4, |
145 | | - # **kwargs: Any, |
146 | | - # ) -> List[Tuple[Document, float]]: |
147 | | - # """Return docs and relevance scores, normalized on a scale from 0 to 1. |
148 | | - # |
149 | | - # 0 is dissimilar, 1 is most similar. |
150 | | - # """ |
151 | | - # raise NotImplementedError |
152 | | - # |
153 | | - # def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: |
154 | | - # """Return docs most similar to embedding vector. |
155 | | - # |
156 | | - # Args: |
157 | | - # embedding: Embedding to look up documents similar to. |
158 | | - # k: Number of Documents to return. Defaults to 4. |
159 | | - # |
160 | | - # Returns: |
161 | | - # List of Documents most similar to the query vector. |
162 | | - # """ |
163 | | - # from docarray.utils.find import find |
164 | | - # |
165 | | - # query_doc = self.doc_cls(embedding=embedding) |
166 | | - # result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents |
167 | | - # |
168 | | - # result = [Document(page_content=doc.text) for doc in result_docs] |
169 | | - # return result |
170 | | - |
171 | | - def max_marginal_relevance_search( |
172 | | - self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any |
173 | | - ) -> List[Document]: |
174 | | - """Return docs selected using the maximal marginal relevance. |
175 | | -
176 | | - Maximal marginal relevance optimizes for similarity to query AND diversity |
177 | | - among selected documents. |
| 49 | + """Create an in-memory store and insert data. |
178 | 50 |
179 | 51 | Args: |
180 | | - query: Text to look up documents similar to. |
181 | | - k: Number of Documents to return. Defaults to 4. |
182 | | - fetch_k: Number of Documents to fetch to pass to MMR algorithm. |
| 52 | + texts (List[str]): Text data. |
| 53 | + embedding (Embeddings): Embedding function. |
| 54 | + metadatas (Optional[List[dict]]): Metadata for each text if it exists. |
| 55 | + Defaults to None. |
| 56 | + metric (str): metric for exact nearest-neighbor search. |
| 57 | + Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". |
| 58 | + Defaults to "cosine_sim". |
183 | 59 |
184 | 60 | Returns: |
185 | | - List of Documents selected by maximal marginal relevance. |
186 | | - """ |
187 | | - from docarray.utils.find import find |
188 | | - |
189 | | - query_embedding = self.embedding.embed_query(query) |
190 | | - query_doc = self.doc_cls(embedding=query_embedding) |
191 | | - find_res = find(self.docs, query_doc, limit=k) |
192 | | - |
193 | | - embeddings = [emb for emb in find_res.documents.emb] |
194 | | - mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) |
195 | | - results = [] |
196 | | - for idx in mmr_selected: |
197 | | - results.append(Document(page_content=self.docs[idx].text)) |
198 | | - return results |
199 | | - |
| 61 | + InMemory Vector Store |
| 62 | + """ |
| 63 | + store = cls( |
| 64 | + embedding=embedding, |
| 65 | + metric=metric, |
| 66 | + ) |
| 67 | + store.add_texts(texts=texts, metadatas=metadatas) |
| 68 | + return store |
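
For reviewers, a minimal usage sketch of the `from_texts` path added above. The module path `langchain.vectorstores.in_memory` and the choice of `OpenAIEmbeddings` are assumptions for illustration only (the diff does not show where `InMemory` is exported), and `similarity_search` is assumed to be inherited from `VecStoreFromDocIndex`, which is consistent with the local overrides being dropped in this change.

```python
# Sketch only: the import path for InMemory is an assumption, not shown in this diff.
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.in_memory import InMemory  # assumed module path

texts = ["foo", "bar", "baz"]
metadatas = [{"source": f"doc-{i}"} for i in range(len(texts))]

# from_texts builds the InMemoryExactNNIndex-backed store and inserts the texts.
store = InMemory.from_texts(
    texts=texts,
    embedding=OpenAIEmbeddings(),
    metadatas=metadatas,
    metric="cosine_sim",  # or "euclidean_dist" / "sqeuclidean_dist"
)

# similarity_search is expected to come from the VecStoreFromDocIndex base class.
docs = store.similarity_search("foo", k=2)
```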
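The same store can also be built without the classmethod, mirroring what `from_texts` now does internally (construct the store, then call `add_texts`); the same import-path caveat applies.

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.in_memory import InMemory  # assumed module path

# Equivalent to the from_texts call above: construct the empty store, then insert texts.
store = InMemory(embedding=OpenAIEmbeddings(), metric="cosine_sim")
ids = store.add_texts(
    texts=["foo", "bar", "baz"],
    metadatas=[{"source": "a"}, {"source": "b"}, {"source": "c"}],
)
```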