@@ -1,11 +1,17 @@
 import os
 
 from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders import PyPDFLoader
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.prompts import ChatPromptTemplate
 from langchain.pydantic_v1 import BaseModel
 from langchain.schema.output_parser import StrOutputParser
-from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
+from langchain.schema.runnable import (
+    RunnableLambda,
+    RunnableParallel,
+    RunnablePassthrough,
+)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import MongoDBAtlasVectorSearch
 from pymongo import MongoClient
 
@@ -54,3 +60,25 @@ class Question(BaseModel):
 
 
 chain = chain.with_types(input_type=Question)
+
+
+def _ingest(url: str) -> dict:
+    loader = PyPDFLoader(url)
+    data = loader.load()
+
+    # Split the loaded pages into 500-character chunks with no overlap
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
+    docs = text_splitter.split_documents(data)
+
+    # Embed the chunks and insert them into the MongoDB Atlas Vector Search collection
+    _ = MongoDBAtlasVectorSearch.from_documents(
+        documents=docs,
+        embedding=OpenAIEmbeddings(disallowed_special=()),
+        collection=MONGODB_COLLECTION,
+        index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
+    )
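+    # Nothing meaningful to return; an empty dict keeps the output JSON-serializable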
+    return {}
+
+
+ingest = RunnableLambda(_ingest)
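
For reference, a minimal usage sketch for the new ingest runnable (not part of the diff). It assumes the template's environment is already configured (MONGO_URI, OPENAI_API_KEY, and an existing Atlas vector search index); the module path and PDF URL below are placeholders:

    # Hypothetical import path; adjust to wherever this chain module lives
    from rag_mongo.chain import ingest

    # Downloads the PDF, splits it into 500-character chunks, embeds each chunk
    # with OpenAIEmbeddings, and writes the vectors to the Atlas collection
    ingest.invoke("https://example.com/sample.pdf")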