# main.py (new file in this patch): example entry point for openlrc.
from openlrc import LRCer

if __name__ == '__main__':
    # Route OpenAI API traffic through an alternate endpoint via base_url_config.
    lrcer = LRCer(chatbot_model='gpt-4o',
                  base_url_config={'openai': 'https://api.gptsapi.net/v1'},
                  # device='cpu',
                  # compute_type='float32',
                  # is_force_glossary_used=True,
                  )

    # Transcribe, translate to Simplified Chinese, and run the
    # video-understanding pass (one sampled frame every 5 seconds).
    lrcer.run(['./data/test.mp3'], target_lang='zh-cn', skip_trans=False,
              video_understanding=True, sampling_frequency=5)
    # Generate translated ./data/test_audio.lrc and ./data/test_video.srt

    # Use glossary to improve translation
    # lrcer = LRCer(glossary='/data/aoe4-glossary.yaml')
    #
    # To skip translation process
    # lrcer.run('./data/test.mp3', target_lang='en', skip_trans=True)
    #
    # Change asr_options or vad_options, check openlrc.defaults for details
    # vad_options = {"threshold": 0.1}
    # lrcer = LRCer(vad_options=vad_options)
    # lrcer.run('./data/test.mp3', target_lang='zh-cn')
    #
    # Enhance the audio using noise suppression (consumes more time).
+ # lrcer.run('./data/test.mp3', target_lang='zh-cn', noise_suppress=True) + # + # # Change the LLM model for translation + # lrcer = LRCer(chatbot_model='claude-3-sonnet-20240229') + # lrcer.run('./data/test.mp3', target_lang='zh-cn') + # + # # Clear temp folder after processing done + # lrcer.run('./data/test.mp3', target_lang='zh-cn', clear_temp=True) + # + # # Change base_url + # lrcer = LRCer(base_url_config={'openai': 'https://api.g4f.icu/v1', + # 'anthropic': 'https://example/api'}) + # + # # Route model to arbitrary Chatbot SDK + # lrcer = LRCer(chatbot_model='openai: claude-3-sonnet-20240229', + # base_url_config={'openai': 'https://api.g4f.icu/v1/'}) + # + # # Bilingual subtitle + # lrcer.run('./data/test.mp3', target_lang='zh-cn', bilingual_sub=True) \ No newline at end of file diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py index 17427aa..7733188 100644 --- a/openlrc/openlrc.py +++ b/openlrc/openlrc.py @@ -26,6 +26,8 @@ from openlrc.utils import Timer, extend_filename, get_audio_duration, format_timestamp, extract_audio, \ get_file_type +from openlrc.video_bot import videoBot + class LRCer: """ @@ -86,6 +88,7 @@ def __init__(self, whisper_model: str = 'large-v3', compute_type: str = 'float16 self.transcriber = Transcriber(model_name=whisper_model, compute_type=compute_type, device=device, asr_options=self.asr_options, vad_options=self.vad_options) self.transcribed_paths = [] + self.transcribed_origin_paths = [] @staticmethod def parse_glossary(glossary: Union[dict, str, Path]): @@ -119,6 +122,7 @@ def produce_transcriptions(self, transcription_queue, audio_paths, src_lang): """ for audio_path in audio_paths: transcribed_path = extend_filename(audio_path, '_transcribed').with_suffix('.json') + self.transcribed_origin_paths.append(transcribed_path) if not transcribed_path.exists(): with Timer('Transcription process'): logger.info( @@ -218,6 +222,8 @@ def handle_bilingual_subtitles(transcribed_path, base_name, transcribed_opt_sub, 
logger.debug('Translation worker waiting transcription...') transcribed_path = transcription_queue.get() + logger.info(f'Got transcription: {transcribed_path}') + if transcribed_path is None: transcription_queue.put(None) logger.debug('Translation worker finished.') @@ -298,7 +304,7 @@ def _translate(self, audio_name, target_lang, transcribed_opt_sub, translated_pa return final_subtitle def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optional[str] = None, target_lang='zh-cn', - skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp=False) -> List[str]: + skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp=False,video_understanding=False,sampling_frequency=0) -> List[str]: """ Run the entire transcription and translation process. @@ -357,6 +363,8 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona if isinstance(paths, str) or isinstance(paths, Path): paths = [paths] + video_path = paths + paths = list(map(Path, paths)) audio_paths = self.pre_process(paths, noise_suppress=noise_suppress) @@ -384,6 +392,15 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona logger.info('Clearing temporary folder...') self.clear_temp_files(audio_paths) + + + if video_understanding: + assert sampling_frequency > 0, f"sampling_frequency should be greater than 0" + assert len(video_path) == len(self.transcribed_origin_paths), f"the number of video files and transcribed files do not match" + for i in range(len(video_path)): + Bot = videoBot(video_path[i],self.transcribed_origin_paths[i],sampling_frequency) + Bot.inference() + return self.transcribed_paths def clear_temp_files(self, paths): diff --git a/openlrc/video_bot.py b/openlrc/video_bot.py new file mode 100644 index 0000000..7f86e26 --- /dev/null +++ b/openlrc/video_bot.py @@ -0,0 +1,103 @@ +import base64 +from PIL import Image +import numpy as np +from decord import VideoReader, cpu +from openlrc.logger 
import logger
import anthropic
import os
import json

# SECURITY(review): a live API key was hardcoded and committed on this line.
# It has been removed — rotate that key immediately. Credentials are now read
# from the environment inside inference() instead of being forced into
# os.environ at import time.


class videoBot:
    """Produce a textual description of a video.

    Frames are sampled from the video at a fixed interval, combined with the
    transcribed subtitle text, and sent to an Anthropic vision model; the
    model's answer is written to ./tests/data/<video>_understanding.txt.
    """

    def __init__(self, video_path, text_path, num_frequence=5):
        """
        Args:
            video_path: Path to the source video file.
            text_path: Path to the '*_transcribed.json' transcription file;
                expected to hold a top-level "segments" list whose items have
                a "text" field.
            num_frequence: Sampling interval in seconds between frames.
        """
        self.video_path = video_path
        self.text_path = text_path
        self.num_frequence = num_frequence

    def encode_image_base64(self, img):
        """Return *img* encoded as JPEG wrapped in a base64 ASCII string."""
        from io import BytesIO
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def get_index(self, fps, max_frame):
        """Return frame indices sampled every ``num_frequence`` seconds.

        The step is clamped to >= 1: with a low fps or a sub-second sampling
        interval, int(num_frequence * fps) could be 0 and range() would raise
        ``ValueError: range() arg 3 must not be zero``.
        """
        step = max(1, int(self.num_frequence * fps))
        return list(range(0, max_frame + 1, step))

    def load_video_frames(self, video_path, bound=None):
        """Decode the sampled frames of *video_path* as RGB PIL images."""
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())
        frame_indices = self.get_index(fps, max_frame)
        return [Image.fromarray(vr[idx].asnumpy()).convert('RGB')
                for idx in frame_indices]

    def _load_subtitle_text(self):
        """Join all segment texts from the transcription JSON into one string."""
        with open(self.text_path, "r", encoding="utf-8") as json_file:
            json_data = json.load(json_file)
        return " ".join(segment["text"] for segment in json_data.get("segments", []))

    def _build_content_blocks(self):
        """Assemble the message content: sampled frames, then the text prompt."""
        content_blocks = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": self.encode_image_base64(img),
                },
            }
            for img in self.load_video_frames(self.video_path)
        ]

        combined_text = self._load_subtitle_text()
        description_text = f'''the following is the subtitle of the video: {combined_text},
        please give a detailed description of the video content based on the subtitle and the image.'''

        content_blocks.append({
            "type": "text",
            "text": description_text,
        })
        return content_blocks

    def inference(self):
        """Run the video-understanding query and write the answer to disk.

        Raises:
            RuntimeError: If no API key is available in the environment.
        """
        logger.info("start understanding")

        # Read the key at call time; never hardcode credentials in source.
        api_key = os.environ.get('OPENAI_API_KEY') or os.environ.get('ANTHROPIC_API_KEY')
        if not api_key:
            raise RuntimeError('No API key found: set OPENAI_API_KEY or ANTHROPIC_API_KEY')

        client = anthropic.Anthropic(
            base_url="https://api.gptsapi.net",
            api_key=api_key,
        )

        response = client.messages.create(
            model="claude-3-sonnet-20240229",  # or claude-3-opus-20240229
            max_tokens=1000,
            temperature=0.5,
            messages=[{
                "role": "user",
                "content": self._build_content_blocks(),
            }],
        )

        output_text = response.content[0].text

        video_prefix = os.path.splitext(os.path.basename(self.video_path))[0]
        output_file_path = f"./tests/data/{video_prefix}_understanding.txt"
        # Make sure the (hardcoded) output directory exists before writing.
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        with open(output_file_path, "w", encoding="utf-8") as file:
            file.write(output_text)

        logger.info(f"video understanding saved to {output_file_path}")