11# Copyright (C) 2024 Intel Corporation
22# SPDX-License-Identifier: Apache-2.0
33
4- import asyncio
54import base64
65import os
76import subprocess
@@ -55,15 +54,15 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
5554 return inputs
5655
5756
def read_pdf(file: str):
    """Load a PDF file and split it into documents.

    Args:
        file (str): Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The documents produced by ``PyPDFLoader.load_and_split()``.
    """
    # Imported lazily so langchain is only loaded when a PDF is actually parsed.
    from langchain.document_loaders import PyPDFLoader

    return PyPDFLoader(file).load_and_split()
6463
6564
async def encode_file_to_base64(f: UploadFile):
    """Encode the content of an uploaded file to a base64 string.

    Args:
        f (UploadFile): The uploaded file; its content is read with
            ``await f.read()``.

    Returns:
        str: The base64 encoded string of the file content.
    """
    # UploadFile.read() is a coroutine function: await it first to obtain the
    # bytes, then encode. Awaiting the decoded string (and handing the
    # un-awaited coroutine to b64encode) raises TypeError.
    raw = await f.read()
    return base64.b64encode(raw).decode("utf-8")
7876
7977
@@ -90,6 +88,7 @@ def video2audio(
9088 """
9189 video_data = base64 .b64decode (video_base64 )
9290
91+ # TODO: why is this processing not async?
9392 uid = str (uuid .uuid4 ())
9493 temp_video_path = f"{ uid } .mp4"
9594 temp_audio_path = f"{ uid } .mp3"
@@ -115,29 +114,50 @@ def video2audio(
115114 return audio_base64
116115
117116
async def read_text_from_file(file: UploadFile):
    """Extract the text content of an uploaded text, PDF or docx file.

    The parser is selected from the upload's ``content-type`` header.

    Args:
        file (UploadFile): The uploaded file.

    Returns:
        * ``text/plain``: list of chunks from ``CharacterTextSplitter``.
        * ``application/pdf``: list with the ``page_content`` of every
          document returned by ``read_pdf``.
        * docx / ``application/octet-stream``: the value returned by
          ``docx2txt.process``.
        * ``None`` when the content type is missing or not supported.
    """
    docx_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/octet-stream",
    )

    # .get() so an upload without a content-type header is treated as
    # unsupported instead of raising KeyError.
    ctype = file.headers.get("content-type")
    if ctype not in ("text/plain", "application/pdf", *docx_types):
        return None

    import aiofiles
    import docx2txt
    from langchain.text_splitter import CharacterTextSplitter

    # Rewind and read through the async UploadFile API so every branch sees the
    # whole upload and no blocking file.file call runs inside this coroutine.
    await file.seek(0)
    data = await file.read()

    # read text file
    if ctype == "text/plain":
        # Split text to multiple documents
        return CharacterTextSplitter().split_text(data.decode("utf-8"))

    # The PDF and docx parsers take a file path, so spool the upload to a
    # temporary file. The context manager closes (and thereby removes) the
    # file on exit, so no explicit close() is needed.
    async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
        await tmp.write(data)
        await tmp.flush()

        # read pdf file
        if ctype == "application/pdf":
            return [doc.page_content for doc in read_pdf(tmp.name)]

        # read docx file (the only content types left are docx_types)
        return docx2txt.process(tmp.name)
143163
@@ -201,25 +221,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
201221 file_summaries = []
202222 if files :
203223 for file in files :
204- # Fix concurrency issue with the same file name
205- # https://github.com/opea-project/GenAIExamples/issues/1279
206- uid = str (uuid .uuid4 ())
207- file_path = f"/tmp/{ uid } "
208-
209- import aiofiles
210-
211- async with aiofiles .open (file_path , "wb" ) as f :
212- await f .write (await file .read ())
213224
214225 if data_type == "text" :
215- docs = read_text_from_file (file , file_path )
226+ docs = await read_text_from_file (file )
216227 elif data_type in ["audio" , "video" ]:
217- docs = encode_file_to_base64 (file_path )
228+ docs = await encode_file_to_base64 (file )
218229 else :
219230 raise ValueError (f"Data type not recognized: { data_type } " )
220231
221- os .remove (file_path )
222-
223232 if isinstance (docs , list ):
224233 file_summaries .extend (docs )
225234 else :
0 commit comments