Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
.env
.env
.idea
.chroma
venv
.DS_Store
Binary file added db/chroma-collections.parquet
Binary file not shown.
Binary file added db/chroma-embeddings.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
66 changes: 66 additions & 0 deletions pdf-retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os

import gradio as gr
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationSummaryBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

load_dotenv()

# LLM used both for answering questions and for summarising chat memory.
llm = ChatOpenAI(temperature=0.1, model_name="gpt-4")

embeddings = OpenAIEmbeddings()

PERSIST_DIR = "db"

# Data ingestion: only (re-)embed the PDFs when no persisted index exists.
# Previously every startup re-loaded, re-split and re-embedded all PDFs and
# called Chroma.from_documents again — paying the OpenAI embedding cost each
# run and duplicating rows in the persisted collection — and then discarded
# that instance to re-open a second one from disk.
if os.path.isdir(PERSIST_DIR) and os.listdir(PERSIST_DIR):
    # Reuse the previously persisted vector index.
    vectordb = Chroma(persist_directory=PERSIST_DIR, embedding_function=embeddings)
else:
    pdf_loader = PyPDFDirectoryLoader('static/')
    documents = pdf_loader.load()

    # Chunking: ~1500-char chunks with 150-char overlap so answers spanning
    # a chunk boundary are not cut off from their context.
    text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    documents = text_splitter.split_documents(documents)

    vectordb = Chroma.from_documents(documents, embeddings,
                                     persist_directory=PERSIST_DIR)
    vectordb.persist()

# Summarising buffer memory: keeps recent turns verbatim and summarises older
# ones once the 850-token budget is exceeded; `answer` is the key the chain
# stores as output.
memory = ConversationSummaryBufferMemory(
    llm=llm,
    max_token_limit=850,
    output_key='answer',
    memory_key='chat_history',
    return_messages=True)

# Retrieve the top-3 chunks by embedding similarity for each question.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3})

# Conversational RAG chain: condenses the question using chat history, then
# "stuff"s the retrieved chunks into a single prompt for the LLM.
qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory,
                                           get_chat_history=lambda h: h,
                                           chain_type="stuff")

# Gradio UI: a chat transcript, a question box, and a Clear button.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")


    def respond(user_message, history):
        """Run one QA turn.

        Queries the retrieval chain with the user's question and the visible
        chat history, appends the (question, answer) pair to the transcript,
        and blanks the input textbox.
        """
        response = qa({"question": user_message, "chat_history": history})
        history.append((user_message, response["answer"]))
        # First output clears the textbox; second refreshes the chatbot.
        return gr.update(value=""), history


    def reset():
        """Wipe both the visible transcript and the chain's memory.

        The Clear button previously only blanked the Chatbot component; the
        ConversationSummaryBufferMemory still carried the old conversation,
        so "cleared" chats kept influencing later answers.
        """
        memory.clear()
        return None


    msg.submit(respond, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(reset, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch(debug=True)
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
unstructured
langchain
openai
pdf2image
chromadb
tiktoken
python-dotenv
pypdf
streamlit
Binary file added static/Aadhaar FAQ.pdf
Binary file not shown.
Binary file added static/Resident_HandBook_Hindi.pdf
Binary file not shown.
Binary file added static/UIDAI_Handbook English.pdf
Binary file not shown.
Binary file added static/recently_asked_questions.pdf
Binary file not shown.