-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument_processing.py
More file actions
119 lines (95 loc) · 4.14 KB
/
document_processing.py
File metadata and controls
119 lines (95 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from config import config
def load_pdf_content(pdf_directory: str) -> list:
    """
    Load every PDF file found directly inside a directory and return its pages.

    Files are matched by a case-insensitive ".pdf" extension and visited in
    sorted filename order so the resulting page order is reproducible. A file
    that fails to load is reported and skipped; the remaining files are still
    processed. Subdirectories are not searched.

    Args:
        pdf_directory (str): The directory containing PDF files to ingest.

    Returns:
        list: The pages returned by ``PyPDFLoader(...).load()`` for each PDF,
            concatenated. Empty when the path is not a directory or when no
            PDF could be loaded.
    """
    print(f"Starting PDF document loading from '{pdf_directory}'...")

    # Require an actual directory: a path that exists but points at a regular
    # file would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(pdf_directory):
        print(f"Error: PDF directory '{pdf_directory}' not found.")
        print("Please create this directory and place your PDF files inside.")
        return []

    all_pdf_docs = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(pdf_directory)):
        # Compare lower-cased so "REPORT.PDF" is picked up as well.
        if not filename.lower().endswith(".pdf"):
            continue

        filepath = os.path.join(pdf_directory, filename)
        print(f"Loading PDF document: {filepath}")
        try:
            pages = PyPDFLoader(filepath).load()
        except Exception as e:
            # Deliberately best-effort: one unreadable PDF must not abort
            # the ingestion of the others.
            print(f"Error loading {filepath}: {e}")
        else:
            all_pdf_docs.extend(pages)

    if not all_pdf_docs:
        print("No PDF documents found or loaded in the specified directory.")
    else:
        print(f"Loaded {len(all_pdf_docs)} pages from PDF documents.")
    return all_pdf_docs
def ingest_all_documents(
    pdf_directory_1: str = "data",
    pdf_directory_2: str = "data",
    persist_directory: str = "docs/chroma"
):
    """
    Orchestrates the loading of PDF documents, combines them,
    splits them into chunks, generates embeddings, and creates/persists a
    Chroma vector database.

    If both source directories resolve to the same location (which is the
    case with the default arguments), the directory is loaded only once so
    that its pages are not stored twice in the vector database.

    Args:
        pdf_directory_1 (str): First directory containing PDF files to
            ingest. Defaults to "data".
        pdf_directory_2 (str): Second directory containing PDF files to
            ingest. Defaults to "data".
        persist_directory (str): The directory where the Chroma vector database
            will be persisted. Defaults to "docs/chroma".

    Returns:
        None. Returns early, without creating a database, when no documents
        could be loaded.
    """
    print("\n--- Starting overall document ingestion process ---")

    # 1. Load PDF content from each distinct source directory.
    combined_docs = []
    seen_directories = set()
    for source_directory in (pdf_directory_1, pdf_directory_2):
        # Compare resolved paths so "data", "./data" and symlinks to the same
        # place are recognised as one source and not ingested twice.
        resolved = os.path.normcase(os.path.realpath(source_directory))
        if resolved in seen_directories:
            print(f"Skipping '{source_directory}': this directory was already loaded.")
            continue
        seen_directories.add(resolved)
        combined_docs.extend(load_pdf_content(source_directory))

    if not combined_docs:
        print("No documents (PDF) were loaded. Exiting ingestion.")
        return
    print(f"\nTotal combined documents loaded: {len(combined_docs)}")

    # 2. Document Splitters (sizes come from the project config object)
    chunk_size = config.CHUNK_SIZE
    chunk_overlap = config.CHUNK_OVERLAP
    print(f"Splitting documents into chunks (size: {chunk_size}, overlap: {chunk_overlap})...")
    r_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunked_docs = r_splitter.split_documents(combined_docs)
    print(f"Split documents into {len(chunked_docs)} chunks.")

    # 3. Embeddings
    model_name = config.EMBEDDING_MODEL_NAME
    print(f"Initializing embeddings with model: {model_name}")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # 4. Create and Persist Vector DB
    print(f"Creating and persisting Chroma DB to '{persist_directory}'...")
    # Ensure the persist directory exists
    os.makedirs(persist_directory, exist_ok=True)
    vectordb = Chroma.from_documents(
        documents=chunked_docs,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    # NOTE(review): recent Chroma releases are reported to persist
    # automatically; confirm whether this explicit call is still required
    # for the installed version.
    vectordb.persist()  # Explicitly persist the database

    print(f"Successfully processed {len(chunked_docs)} document chunks and persisted Chroma DB.")
    print(f"You can now run your application using the data in '{persist_directory}'.")
    print("--- Document ingestion process complete ---")
if __name__ == "__main__":
    # Script entry point: run a full ingestion using the source and persist
    # locations taken from the project config object.
    ingestion_settings = {
        "pdf_directory_1": config.PDF_SOURCE_DIRECTORY_1,
        "pdf_directory_2": config.PDF_SOURCE_DIRECTORY_2,
        "persist_directory": config.CHROMA_PERSIST_DIRECTORY,
    }
    ingest_all_documents(**ingestion_settings)