Skip to content

Commit 83b034a

Browse files
committed
Improve DocSum file handling
Use a temporary file only when necessary, and use aiofiles' own functionality for that. Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
1 parent c0a826f commit 83b034a

File tree

1 file changed

+43
-34
lines changed

1 file changed

+43
-34
lines changed

DocSum/docsum.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# Copyright (C) 2024 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import asyncio
54
import base64
65
import json
76
import os
@@ -56,15 +55,15 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
5655
return inputs
5756

5857

59-
def read_pdf(file):
58+
def read_pdf(file: str):
6059
from langchain.document_loaders import PyPDFLoader
6160

6261
loader = PyPDFLoader(file)
6362
docs = loader.load_and_split()
6463
return docs
6564

6665

67-
def encode_file_to_base64(file_path):
66+
async def encode_file_to_base64(f: UploadFile):
6867
"""Encode the content of a file to a base64 string.
6968
7069
Args:
@@ -73,8 +72,7 @@ def encode_file_to_base64(file_path):
7372
Returns:
7473
str: The base64 encoded string of the file content.
7574
"""
76-
with open(file_path, "rb") as f:
77-
base64_str = base64.b64encode(f.read()).decode("utf-8")
75+
base64_str = await base64.b64encode(f.read()).decode("utf-8")
7876
return base64_str
7977

8078

@@ -91,6 +89,7 @@ def video2audio(
9189
"""
9290
video_data = base64.b64decode(video_base64)
9391

92+
# TODO: why is this processing not async?
9493
uid = str(uuid.uuid4())
9594
temp_video_path = f"{uid}.mp4"
9695
temp_audio_path = f"{uid}.mp3"
@@ -116,29 +115,50 @@ def video2audio(
116115
return audio_base64
117116

118117

119-
def read_text_from_file(file, save_file_name):
118+
async def read_text_from_file(file: UploadFile):
119+
ctype = file.headers["content-type"]
120+
valid = (
121+
"text/plain",
122+
"application/pdf",
123+
"application/octet-stream",
124+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
125+
)
126+
127+
file_content = None
128+
if ctype not in valid:
129+
return file_content
130+
131+
import aiofiles
120132
import docx2txt
121133
from langchain.text_splitter import CharacterTextSplitter
122134

123135
# read text file
124-
if file.headers["content-type"] == "text/plain":
136+
if ctype == "text/plain":
125137
file.file.seek(0)
126138
content = file.file.read().decode("utf-8")
127-
# Split text
139+
# Split text to multiple documents
128140
text_splitter = CharacterTextSplitter()
129-
texts = text_splitter.split_text(content)
130-
# Create multiple documents
131-
file_content = texts
132-
# read pdf file
133-
elif file.headers["content-type"] == "application/pdf":
134-
documents = read_pdf(save_file_name)
135-
file_content = [doc.page_content for doc in documents]
136-
# read docx file
137-
elif (
138-
file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
139-
or file.headers["content-type"] == "application/octet-stream"
140-
):
141-
file_content = docx2txt.process(save_file_name)
141+
return text_splitter.split_text(content)
142+
143+
# need a tmp file for the rest
144+
async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
145+
await tmp.write(await file.read())
146+
await tmp.flush()
147+
148+
# read pdf file
149+
if ctype == "application/pdf":
150+
documents = read_pdf(tmp.name)
151+
file_content = [doc.page_content for doc in documents]
152+
153+
# read docx file
154+
if ctype in (
155+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
156+
"application/octet-stream",
157+
):
158+
file_content = docx2txt.process(tmp.name)
159+
160+
# remove temp file
161+
await tmp.close()
142162

143163
return file_content
144164

@@ -240,25 +260,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
240260
file_summaries = []
241261
if files:
242262
for file in files:
243-
# Fix concurrency issue with the same file name
244-
# https://github.com/opea-project/GenAIExamples/issues/1279
245-
uid = str(uuid.uuid4())
246-
file_path = f"/tmp/{uid}"
247-
248-
import aiofiles
249-
250-
async with aiofiles.open(file_path, "wb") as f:
251-
await f.write(await file.read())
252263

253264
if data_type == "text":
254-
docs = read_text_from_file(file, file_path)
265+
docs = await read_text_from_file(file)
255266
elif data_type in ["audio", "video"]:
256-
docs = encode_file_to_base64(file_path)
267+
docs = await encode_file_to_base64(file)
257268
else:
258269
raise ValueError(f"Data type not recognized: {data_type}")
259270

260-
os.remove(file_path)
261-
262271
if isinstance(docs, list):
263272
file_summaries.extend(docs)
264273
else:

0 commit comments

Comments
 (0)