# Importing necessary libraries and modules
import os                                                  # For accessing environment variables
from PyPDF2 import PdfReader                               # For reading PDF files
from langchain_openai import OpenAIEmbeddings, OpenAI      # For the OpenAI embedding and completion models
from langchain.text_splitter import CharacterTextSplitter  # For splitting text into manageable chunks
from langchain_community.vectorstores import FAISS         # For building a FAISS vector store
from langchain.chains.question_answering import load_qa_chain  # For loading a question-answering chain

# Retrieving the OpenAI API key from the environment; langchain_openai reads
# OPENAI_API_KEY on its own, so this explicit lookup mainly fails fast (KeyError) if it is unset
open_key = os.environ["OPENAI_API_KEY"]
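
# Illustrative alternative (not in the original script): give a clearer error
# than a bare KeyError when the key is missing
# if "OPENAI_API_KEY" not in os.environ:
#     raise SystemExit("Set the OPENAI_API_KEY environment variable before running")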

# Opening the PDF file to be processed
pdf_file = PdfReader('yourpdf.pdf')

# Initializing a variable to store the extracted text
raw_text = ''

# Looping through each page in the PDF to extract text
for page in pdf_file.pages:
    content = page.extract_text()  # Extracting text from the current page
    if content:
        raw_text += content  # Appending the extracted text to raw_text
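
# Guard added for illustration: image-only (scanned) PDFs yield no extractable
# text, which would leave nothing to index downstream
if not raw_text:
    raise ValueError('No extractable text found in the PDF; it may contain only scanned images')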

# Setting up a text splitter to break the raw text into smaller chunks
text_splitter = CharacterTextSplitter(
    separator='\n',      # Splitting on newline characters
    chunk_size=800,      # Maximum number of characters in a chunk
    chunk_overlap=200,   # Number of characters shared between consecutive chunks
    length_function=len  # Function used to measure text length
)

# Splitting the raw text into manageable chunks
texts = text_splitter.split_text(raw_text)
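
# Optional sanity check (added here as a sketch): confirm the split produced
# chunks before paying for embedding calls
print(f'Split the PDF into {len(texts)} chunks')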

# Initializing the embeddings model using OpenAI
embeddings = OpenAIEmbeddings()

# Creating a FAISS vector store from the text chunks using the embeddings
document_search = FAISS.from_texts(texts, embeddings)
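
# Optionally persist the index so the PDF is not re-embedded on every run
# (a sketch based on the langchain_community FAISS wrapper; recent versions
# require allow_dangerous_deserialization=True when reloading)
# document_search.save_local('faiss_index')
# document_search = FAISS.load_local('faiss_index', embeddings,
#                                    allow_dangerous_deserialization=True)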

# Loading a question-answering chain; the 'stuff' chain type inserts all
# retrieved documents into a single prompt for the LLM
chain = load_qa_chain(OpenAI(), chain_type='stuff')

# Defining the query for the question-answering chain
query = 'What is the agile method?'

# Performing a similarity search in the vector store to find the chunks
# most relevant to the query
docs = document_search.similarity_search(query)

# Invoking the question-answering chain with the relevant chunks and the query,
# then printing the answer
print(chain.invoke(input={'input_documents': docs, 'question': query})['output_text'])
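
# A small reusable wrapper (an added sketch) tying retrieval and answering together
def ask(question: str) -> str:
    relevant_docs = document_search.similarity_search(question)
    return chain.invoke(input={'input_documents': relevant_docs, 'question': question})['output_text']

# Example: print(ask('Who is the intended audience of this document?'))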