# PDF-Chatbot/insert.py at main · Invictus108/PDF-Chatbot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import sys
from pdf_parse import parse_pdf, image_to_base64
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
from langchain_community.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone
import aiohttp
import asyncio
import uuid
import instructor
from pydantic import BaseModel
import json
from openai import OpenAI

def insert_pdf(pdf_path: str, openai_api_key: str, pinecone_api_key: str) -> str:
    """Parse a PDF, summarize it, and index its text and images in Pinecone.

    Steps, in order:

    1. Parse the PDF into text and images with ``parse_pdf``.
    2. Ask ``gpt-4o-mini`` for a summary of the whole document.
    3. Split the text into overlapping chunks.
    4. Ask ``gpt-4o-mini`` for a short summary and four questions per image.
    5. Embed every chunk / summary / question with ``text-embedding-3-small``.
    6. Upsert all vectors into the ``pdf-chatbot`` Pinecone index.

    Side effects: sets ``OPENAI_API_KEY`` and ``PINECONE_API_KEY`` in
    ``os.environ`` and writes vectors to the Pinecone index.

    Args:
        pdf_path: Path of the PDF to ingest; also stored as the ``source``
            metadata of every vector.
        openai_api_key: API key exported as ``OPENAI_API_KEY``.
        pinecone_api_key: API key exported as ``PINECONE_API_KEY``.

    Returns:
        The model-generated document summary followed by a newline.

    Raises:
        ValueError: If the number of embedding entries and metadata entries
            passed to the upsert step differ.
    """
    # parse pdf
    # NOTE(review): `text` is later handed to the text splitter, which
    # presumably expects a single string -- confirm that
    # parse_pdf(..., pages=True) returns one.
    text, images = parse_pdf(pdf_path, pages=True)

    # The OpenAI / LangChain clients below are constructed without explicit
    # keys, so the keys are exported through the environment.
    os.environ['OPENAI_API_KEY'] = openai_api_key
    os.environ['PINECONE_API_KEY'] = pinecone_api_key

    # Initialize the instructor client (structured responses via pydantic)
    client = instructor.from_openai(OpenAI())

    # define embeddings
    embedding_model = OpenAIEmbeddings(
        model="text-embedding-3-small"
    )

    def make_json(text, images, question):
        """Build a single-user-message chat payload.

        The message content holds the text prompt followed by one
        ``image_url`` entry (base64 JPEG data URL) per image.
        """
        # Resolved outside the f-string: reusing double quotes inside a
        # replacement field is a SyntaxError before Python 3.12.
        context = text if text else "No context in this query"
        content = [
            {
                "type": "text",
                "text": f"Use this context: {context}, alongside the images to answer this question {question}",
            },
        ]
        content.extend(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_to_base64(image)}"
                },
            }
            for image in images
        )
        return [{"role": "user", "content": content}]

    # template for summary
    # (comments rather than docstrings on the response models, so the schema
    # handed to the model is unchanged)
    class Summary(BaseModel):
        summary: str

    # get original summary from chatGPT
    query = "Summerize the PDF"
    message = make_json(text, images, query)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=Summary,
        messages=message,
    )
    summary = response.summary + "\n"

    def chunk_data(text, chunk_size=1000, chunk_overlap=200):
        """Split ``text`` into overlapping chunks for embedding."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return text_splitter.split_text(text)

    split_docs = chunk_data(text)

    # template for image summaries
    class Image_Response(BaseModel):
        summary: str
        question_1: str
        question_2: str
        question_3: str
        question_4: str

    async def fetch_summary(messages):
        """Return ``[summary, q1, q2, q3, q4]`` for one image payload."""
        # The instructor client is synchronous; run it in a worker thread so
        # several image requests can be in flight at once.
        response = await asyncio.to_thread(
            client.chat.completions.create,
            model="gpt-4o-mini",
            response_model=Image_Response,
            messages=messages,
        )
        return [
            response.summary,
            response.question_1,
            response.question_2,
            response.question_3,
            response.question_4,
        ]

    async def get_image_summaries(images):
        """Fetch the summary/questions list for every image concurrently."""
        query = "Provide a short summary of the image and 4 specific questions about data in the presentation that would be asked by a business analyst. Seperate each individual entry with four dashes (----). Ignore text and only focus on images and tables."
        tasks = [fetch_summary(make_json(None, [image], query)) for image in images]
        # gather() yields exactly one result per task, in input order, so
        # image_summaries[k] always belongs to images[k].
        return await asyncio.gather(*tasks)

    image_summaries = asyncio.run(get_image_summaries(images))

    def get_embeddings(chunks):
        """Embed a list of strings; returns one vector per string."""
        if not chunks:
            return []
        # One batched request instead of one HTTP round trip per chunk.
        return embedding_model.embed_documents(list(chunks))

    # get embeddings
    text_embeddings = get_embeddings(split_docs)
    # One entry per image; each entry is the list of vectors for that
    # image's summary and four questions.
    image_embeddings = [get_embeddings(part) for part in image_summaries]

    # initialize
    index_name = "pdf-chatbot"
    pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
    index = pc.Index(index_name)

    async def async_upsert(index, vectors):
        """Upsert ``vectors`` without blocking the event loop."""
        # index.upsert is a blocking call; awaiting it directly inside a
        # coroutine would serialize every upsert despite the gather() below.
        await asyncio.to_thread(index.upsert, vectors)

    async def insert_vectors(index, embeddings, metadata):
        """Upsert every embedding with its metadata.

        ``embeddings[k]`` is either a single vector (list of floats) or a
        list of vectors; in the latter case every vector shares
        ``metadata[k]``.
        """
        if len(embeddings) != len(metadata):
            raise ValueError(
                f"embeddings ({len(embeddings)}) and metadata ({len(metadata)}) "
                "must have the same length"
            )

        tasks = []
        for entry, meta in zip(embeddings, metadata):
            if not entry:
                # Nothing was embedded for this item; there is no vector to store.
                continue
            group = entry if isinstance(entry[0], list) else [entry]
            for values in group:
                vectors = [
                    {
                        "id": str(uuid.uuid1()),
                        "values": values,
                        "metadata": meta,
                    }
                ]
                tasks.append(async_upsert(index, vectors))
        await asyncio.gather(*tasks)

    # insert text vectors
    text_metadata = [
        {"source": pdf_path, "type": "text", "content": chunk}
        for chunk in split_docs
    ]
    asyncio.run(insert_vectors(index, text_embeddings, text_metadata))

    # insert image vectors
    # NOTE(review): both str(image) and the "content" value use the image
    # object itself, not its position or its generated summary. If `images`
    # holds anything other than plain strings this is probably not a valid
    # Pinecone metadata value -- confirm what parse_pdf returns and what was
    # intended here.
    images_metadata = [
        {"source": pdf_path + ":" + str(image), "type": "image", "content": image}
        for image in images
    ]
    asyncio.run(insert_vectors(index, image_embeddings, images_metadata))

    return summary