-
Notifications
You must be signed in to change notification settings - Fork 34
INTPYTHON-752 Integrate pymongo-search-utils #208
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,9 +22,10 @@ | |
| from langchain_core.embeddings import Embeddings | ||
| from langchain_core.runnables.config import run_in_executor | ||
| from langchain_core.vectorstores import VectorStore | ||
| from pymongo import MongoClient, ReplaceOne | ||
| from pymongo import MongoClient | ||
| from pymongo.collection import Collection | ||
| from pymongo.errors import CollectionInvalid | ||
| from pymongo_search_utils import bulk_embed_and_insert_texts | ||
|
|
||
| from langchain_mongodb.index import ( | ||
| create_vector_search_index, | ||
|
|
@@ -238,7 +239,7 @@ def __init__( | |
| self._relevance_score_fn = relevance_score_fn | ||
|
|
||
| # append_metadata was added in PyMongo 4.14.0, but is a valid database name on earlier versions | ||
| _append_client_metadata(self._collection.database.client) | ||
| _append_client_metadata(self._collection.database.client, DRIVER_METADATA) | ||
|
|
||
| if auto_create_index is False: | ||
| return | ||
|
|
@@ -362,12 +363,23 @@ def add_texts( | |
| metadatas_batch.append(metadata) | ||
| if (j + 1) % batch_size == 0 or size >= 47_000_000: | ||
| if ids: | ||
| batch_res = self.bulk_embed_and_insert_texts( | ||
| texts_batch, metadatas_batch, ids[i : j + 1] | ||
| batch_res = bulk_embed_and_insert_texts( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This API will change with auto-embedding coming up. It should be insert_texts with perhaps embed as a boolea kwarg. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| embedding_func=self._embedding.embed_documents, | ||
| collection=self._collection, | ||
| text_key=self._text_key, | ||
| embedding_key=self._embedding_key, | ||
| texts=texts_batch, | ||
| metadatas=metadatas_batch, | ||
| ids=ids[i : j + 1], | ||
| ) | ||
| else: | ||
| batch_res = self.bulk_embed_and_insert_texts( | ||
| texts_batch, metadatas_batch | ||
| batch_res = bulk_embed_and_insert_texts( | ||
| embedding_func=self._embedding.embed_documents, | ||
| collection=self._collection, | ||
| text_key=self._text_key, | ||
| embedding_key=self._embedding_key, | ||
| texts=texts_batch, | ||
| metadatas=metadatas_batch, | ||
| ) | ||
| result_ids.extend(batch_res) | ||
| texts_batch = [] | ||
|
|
@@ -376,12 +388,23 @@ def add_texts( | |
| i = j + 1 | ||
| if texts_batch: | ||
| if ids: | ||
| batch_res = self.bulk_embed_and_insert_texts( | ||
| texts_batch, metadatas_batch, ids[i : j + 1] | ||
| batch_res = bulk_embed_and_insert_texts( | ||
| embedding_func=self._embedding.embed_documents, | ||
| collection=self._collection, | ||
| text_key=self._text_key, | ||
| embedding_key=self._embedding_key, | ||
| texts=texts_batch, | ||
| metadatas=metadatas_batch, | ||
| ids=ids[i : j + 1], | ||
| ) | ||
| else: | ||
| batch_res = self.bulk_embed_and_insert_texts( | ||
| texts_batch, metadatas_batch | ||
| batch_res = bulk_embed_and_insert_texts( | ||
| embedding_func=self._embedding.embed_documents, | ||
| collection=self._collection, | ||
| text_key=self._text_key, | ||
| embedding_key=self._embedding_key, | ||
| texts=texts_batch, | ||
| metadatas=metadatas_batch, | ||
| ) | ||
| result_ids.extend(batch_res) | ||
| return result_ids | ||
|
|
@@ -419,39 +442,6 @@ def get_by_ids(self, ids: Sequence[str], /) -> list[Document]: | |
| docs.append(Document(page_content=text, id=oid_to_str(_id), metadata=doc)) | ||
| return docs | ||
|
|
||
| def bulk_embed_and_insert_texts( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We cannot remove public functions of our classes. Instead, wrap those in pymongo-search-utils. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure how to wrap the method with the function from pymongo-search-utils ? @NoahStapp ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We'll need to re-export the |
||
| self, | ||
| texts: Union[List[str], Iterable[str]], | ||
| metadatas: Union[List[dict], Generator[dict, Any, Any]], | ||
| ids: Optional[List[str]] = None, | ||
| ) -> List[str]: | ||
| """Bulk insert single batch of texts, embeddings, and optionally ids. | ||
|
|
||
| See add_texts for additional details. | ||
| """ | ||
| if not texts: | ||
| return [] | ||
| # Compute embedding vectors | ||
| embeddings = self._embedding.embed_documents(list(texts)) | ||
| if not ids: | ||
| ids = [str(ObjectId()) for _ in range(len(list(texts)))] | ||
| docs = [ | ||
| { | ||
| "_id": str_to_oid(i), | ||
| self._text_key: t, | ||
| self._embedding_key: embedding, | ||
| **m, | ||
| } | ||
| for i, t, m, embedding in zip( | ||
| ids, texts, metadatas, embeddings, strict=True | ||
| ) | ||
| ] | ||
| operations = [ReplaceOne({"_id": doc["_id"]}, doc, upsert=True) for doc in docs] | ||
| # insert the documents in MongoDB Atlas | ||
| result = self._collection.bulk_write(operations) | ||
| assert result.upserted_ids is not None | ||
| return [oid_to_str(_id) for _id in result.upserted_ids.values()] | ||
|
|
||
| def add_documents( | ||
| self, | ||
| documents: List[Document], | ||
|
|
@@ -484,8 +474,14 @@ def add_documents( | |
| strict=True, | ||
| ) | ||
| result_ids.extend( | ||
| self.bulk_embed_and_insert_texts( | ||
| texts=texts, metadatas=metadatas, ids=ids[start:end] | ||
| bulk_embed_and_insert_texts( | ||
| embedding_func=self._embedding.embed_documents, | ||
| collection=self._collection, | ||
| text_key=self._text_key, | ||
| embedding_key=self._embedding_key, | ||
| texts=texts, | ||
| metadatas=metadatas, | ||
| ids=ids[start:end], | ||
| ) | ||
| ) | ||
| start = end | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,6 +42,7 @@ dev = [ | |
| "langchain-tests==0.3.22,<1.0", | ||
| "pip>=25.0.1", | ||
| "typing-extensions>=4.12.2", | ||
| "pymongo-search-utils@git+https://github.com/mongodb-labs/pymongo-search-utils.git", | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Has pymongo-search-utils not been published? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Once this integration is tested we can publish, according to @blink1073 and @NoahStapp |
||
| ] | ||
|
|
||
| [tool.pytest.ini_options] | ||
|
|
@@ -77,7 +78,7 @@ lint.select = [ | |
| "B", # flake8-bugbear | ||
| "I", # isort | ||
| ] | ||
| lint.ignore = ["E501", "B008", "UP007", "UP006", "UP035", "UP045"] | ||
| lint.ignore = ["E501", "B008", "UP007", "UP006", "UP035", "UP038", "UP045"] | ||
|
|
||
| [tool.coverage.run] | ||
| omit = ["tests/*"] | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding DRIVER_METADATA is what _append_client_metadata does. It doesn't have any arguments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See above. That said, it's possible that
DRIVER_METADATAis the wrong arg.