11# Copyright (C) 2024 Intel Corporation
22# SPDX-License-Identifier: Apache-2.0
33
4- import asyncio
54import base64
65import json
76import os
@@ -56,15 +55,15 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
5655 return inputs
5756
5857
def read_pdf(file: str):
    """Load a PDF from disk and split it into documents.

    Args:
        file: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The result of ``PyPDFLoader.load_and_split()``; callers in this
        module read ``page_content`` from each returned item.
    """
    # Imported lazily, as in the original, so the dependency is only
    # required when a PDF is actually processed.
    from langchain.document_loaders import PyPDFLoader

    return PyPDFLoader(file).load_and_split()
6564
6665
async def encode_file_to_base64(f: "UploadFile"):
    """Encode the content of an uploaded file to a base64 string.

    Args:
        f: Uploaded file object whose ``read()`` is awaited to obtain the
            raw bytes.

    Returns:
        str: The base64 encoded string of the file content.
    """
    # The read itself must be awaited: ``f.read()`` produces a coroutine, and
    # passing that straight to ``b64encode`` raises TypeError. The encoded
    # result is a plain ``str`` and is not awaitable.
    raw = await f.read()
    return base64.b64encode(raw).decode("utf-8")
7977
8078
@@ -91,6 +89,7 @@ def video2audio(
9189 """
9290 video_data = base64 .b64decode (video_base64 )
9391
92+ # TODO: why is this processing not async?
9493 uid = str (uuid .uuid4 ())
9594 temp_video_path = f"{ uid } .mp4"
9695 temp_audio_path = f"{ uid } .mp3"
@@ -116,29 +115,50 @@ def video2audio(
116115 return audio_base64
117116
118117
async def read_text_from_file(file: "UploadFile"):
    """Extract text from an uploaded plain-text, PDF, or DOCX file.

    The parser is selected by the upload's ``content-type`` header.

    Args:
        file: Uploaded file object. ``file.headers`` supplies the content
            type, ``file.file`` is read directly for plain text, and
            ``file.read()`` is awaited for the other formats.

    Returns:
        For ``text/plain``: the list produced by
        ``CharacterTextSplitter().split_text`` on the UTF-8 decoded content.
        For ``application/pdf``: a list with the ``page_content`` of each
        document returned by ``read_pdf``.
        For the DOCX content type or ``application/octet-stream``: the
        result of ``docx2txt.process``.
        ``None`` when the content type is missing or not one of the above.
    """
    docx_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/octet-stream",
    )

    # ``.get`` so an upload without a content-type header is handled like any
    # other unsupported type instead of raising KeyError.
    ctype = file.headers.get("content-type")
    if ctype not in ("text/plain", "application/pdf", *docx_types):
        return None

    # Heavy optional dependencies are imported only once the type is known to
    # be supported.
    import aiofiles
    import docx2txt
    from langchain.text_splitter import CharacterTextSplitter

    # Plain text is split in memory; no temp file is needed.
    if ctype == "text/plain":
        file.file.seek(0)
        content = file.file.read().decode("utf-8")
        # Split text to multiple documents
        return CharacterTextSplitter().split_text(content)

    # The PDF and DOCX parsers take a filesystem path, so the upload is
    # spooled to a temp file. Leaving the ``async with`` block closes the
    # file, so no explicit close is required.
    async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
        await tmp.write(await file.read())
        await tmp.flush()

        if ctype == "application/pdf":
            documents = read_pdf(tmp.name)
            return [doc.page_content for doc in documents]

        # Only the DOCX / octet-stream types remain at this point.
        return docx2txt.process(tmp.name)
144164
@@ -240,25 +260,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
240260 file_summaries = []
241261 if files :
242262 for file in files :
243- # Fix concurrency issue with the same file name
244- # https://github.com/opea-project/GenAIExamples/issues/1279
245- uid = str (uuid .uuid4 ())
246- file_path = f"/tmp/{ uid } "
247-
248- import aiofiles
249-
250- async with aiofiles .open (file_path , "wb" ) as f :
251- await f .write (await file .read ())
252263
253264 if data_type == "text" :
254- docs = read_text_from_file (file , file_path )
265+ docs = await read_text_from_file (file )
255266 elif data_type in ["audio" , "video" ]:
256- docs = encode_file_to_base64 (file_path )
267+ docs = await encode_file_to_base64 (file )
257268 else :
258269 raise ValueError (f"Data type not recognized: { data_type } " )
259270
260- os .remove (file_path )
261-
262271 if isinstance (docs , list ):
263272 file_summaries .extend (docs )
264273 else :
0 commit comments