Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from openlrc import LRCer

if __name__ == '__main__':
    # Build the transcriber/translator. base_url_config routes OpenAI API
    # traffic through a proxy endpoint instead of api.openai.com.
    lrcer = LRCer(
        chatbot_model='gpt-4o',
        base_url_config={'openai': 'https://api.gptsapi.net/v1'},
        # device='cpu',
        # compute_type='float32',
        # is_force_glossary_used=True,
    )

    # Transcribe and translate; additionally sample one frame every 5 seconds
    # for video understanding (see openlrc/video_bot.py).
    # NOTE(review): video_understanding=True with an .mp3 input looks odd --
    # frame sampling needs a video file; confirm the intended test asset.
    lrcer.run(['./data/test.mp3'], target_lang='zh-cn', skip_trans=False,
              video_understanding=True, sampling_frequency=5)
    # Generates translated ./data/test_audio.lrc and ./data/test_video.srt

    # Use a glossary to improve translation:
    # lrcer = LRCer(glossary='/data/aoe4-glossary.yaml')

    # To skip the translation process:
    # lrcer.run('./data/test.mp3', target_lang='en', skip_trans=True)

    # Change asr_options or vad_options; check openlrc.defaults for details:
    # vad_options = {"threshold": 0.1}
    # lrcer = LRCer(vad_options=vad_options)
    # lrcer.run('./data/test.mp3', target_lang='zh-cn')

    # Enhance the audio using noise suppression (consumes more time):
    # lrcer.run('./data/test.mp3', target_lang='zh-cn', noise_suppress=True)

    # Change the LLM model for translation:
    # lrcer = LRCer(chatbot_model='claude-3-sonnet-20240229')
    # lrcer.run('./data/test.mp3', target_lang='zh-cn')

    # Clear the temp folder after processing is done:
    # lrcer.run('./data/test.mp3', target_lang='zh-cn', clear_temp=True)

    # Change base_url:
    # lrcer = LRCer(base_url_config={'openai': 'https://api.g4f.icu/v1',
    #                                'anthropic': 'https://example/api'})

    # Route model to an arbitrary Chatbot SDK:
    # lrcer = LRCer(chatbot_model='openai: claude-3-sonnet-20240229',
    #               base_url_config={'openai': 'https://api.g4f.icu/v1/'})

    # Bilingual subtitle:
    # lrcer.run('./data/test.mp3', target_lang='zh-cn', bilingual_sub=True)
19 changes: 18 additions & 1 deletion openlrc/openlrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from openlrc.utils import Timer, extend_filename, get_audio_duration, format_timestamp, extract_audio, \
get_file_type

from openlrc.video_bot import videoBot


class LRCer:
"""
Expand Down Expand Up @@ -86,6 +88,7 @@ def __init__(self, whisper_model: str = 'large-v3', compute_type: str = 'float16
self.transcriber = Transcriber(model_name=whisper_model, compute_type=compute_type, device=device,
asr_options=self.asr_options, vad_options=self.vad_options)
self.transcribed_paths = []
self.transcribed_origin_paths = []

@staticmethod
def parse_glossary(glossary: Union[dict, str, Path]):
Expand Down Expand Up @@ -119,6 +122,7 @@ def produce_transcriptions(self, transcription_queue, audio_paths, src_lang):
"""
for audio_path in audio_paths:
transcribed_path = extend_filename(audio_path, '_transcribed').with_suffix('.json')
self.transcribed_origin_paths.append(transcribed_path)
if not transcribed_path.exists():
with Timer('Transcription process'):
logger.info(
Expand Down Expand Up @@ -218,6 +222,8 @@ def handle_bilingual_subtitles(transcribed_path, base_name, transcribed_opt_sub,
logger.debug('Translation worker waiting transcription...')
transcribed_path = transcription_queue.get()

logger.info(f'Got transcription: {transcribed_path}')

if transcribed_path is None:
transcription_queue.put(None)
logger.debug('Translation worker finished.')
Expand Down Expand Up @@ -298,7 +304,7 @@ def _translate(self, audio_name, target_lang, transcribed_opt_sub, translated_pa
return final_subtitle

def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optional[str] = None, target_lang='zh-cn',
skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp=False) -> List[str]:
skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp=False,video_understanding=False,sampling_frequency=0) -> List[str]:
"""
Run the entire transcription and translation process.

Expand Down Expand Up @@ -357,6 +363,8 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona
if isinstance(paths, str) or isinstance(paths, Path):
paths = [paths]

video_path = paths

paths = list(map(Path, paths))

audio_paths = self.pre_process(paths, noise_suppress=noise_suppress)
Expand Down Expand Up @@ -384,6 +392,15 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona
logger.info('Clearing temporary folder...')
self.clear_temp_files(audio_paths)



if video_understanding:
assert sampling_frequency > 0, f"sampling_frequency should be greater than 0"
assert len(video_path) == len(self.transcribed_origin_paths), f"the number of video files and transcribed files do not match"
for i in range(len(video_path)):
Bot = videoBot(video_path[i],self.transcribed_origin_paths[i],sampling_frequency)
Bot.inference()

return self.transcribed_paths

def clear_temp_files(self, paths):
Expand Down
103 changes: 103 additions & 0 deletions openlrc/video_bot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import base64
from PIL import Image
import numpy as np
from decord import VideoReader, cpu
from openlrc.logger import logger
import anthropic
import os
import json

os.environ['OPENAI_API_KEY'] = "sk-OgF4d947838dcea5e04954eb578374b93715da50b196XXT9"

class videoBot:
    """Generate a natural-language description of a video with Claude.

    Frames are sampled from the video at a fixed interval (one frame every
    ``num_frequence`` seconds), combined with the transcribed subtitle text
    read from a transcription JSON file, and sent to an Anthropic-compatible
    Messages API endpoint. The model's answer is written to
    ``./tests/data/<video name>_understanding.txt``.
    """

    def __init__(self, video_path, text_path, num_frequence=5):
        """
        Args:
            video_path: Path to the video file to sample frames from.
            text_path: Path to the transcription JSON (expects a top-level
                ``segments`` list whose items each carry a ``text`` field).
            num_frequence: Sampling interval in seconds between frames.
        """
        self.video_path = video_path
        self.text_path = text_path
        self.num_frequence = num_frequence

    def encode_image_base64(self, img):
        """Return *img* (a ``PIL.Image.Image``) encoded as a base64 JPEG string."""
        from io import BytesIO
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def get_index(self, fps, max_frame):
        """Return frame indices sampled every ``num_frequence`` seconds.

        The step is clamped to at least 1 so that very low frame rates cannot
        produce ``range(..., step=0)``, which would raise ``ValueError``.
        """
        step = max(1, int(self.num_frequence * fps))
        return list(range(0, max_frame + 1, step))

    def load_video_frames(self, video_path, bound=None):
        """Decode the sampled frames from *video_path* as RGB PIL images.

        ``bound`` is currently unused; kept for interface compatibility.
        """
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())
        indices = self.get_index(fps, max_frame)
        return [Image.fromarray(vr[idx].asnumpy()).convert('RGB') for idx in indices]

    def inference(self):
        """Query Claude with the sampled frames plus subtitle text and save the answer.

        Raises:
            KeyError: if ``OPENAI_API_KEY`` is not set in the environment.
        """
        logger.info("start understanding")
        # NOTE(review): the env-var name says OpenAI but the key is used with
        # an Anthropic-compatible proxy endpoint -- confirm this is intended.
        client = anthropic.Anthropic(
            base_url="https://api.gptsapi.net",
            api_key=os.environ['OPENAI_API_KEY']
        )

        imgs = self.load_video_frames(self.video_path)

        # Build the multimodal request content: one image block per frame.
        content_blocks = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": self.encode_image_base64(img),
                },
            }
            for img in imgs
        ]

        # Concatenate all transcribed segment texts into one subtitle string.
        with open(self.text_path, "r", encoding="utf-8") as json_file:
            json_data = json.load(json_file)
        combined_text = " ".join(segment["text"] for segment in json_data.get("segments", []))

        description_text = f'''the following is the subtitle of the video: {combined_text},
        please give a detailed description of the video content based on the subtitle and the image.'''

        # Trailing text block carrying the actual question for the model.
        content_blocks.append({
            "type": "text",
            "text": description_text
        })

        response = client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=1000,
            temperature=0.5,
            messages=[{
                "role": "user",
                "content": content_blocks
            }]
        )

        output_text = response.content[0].text

        video_prefix = os.path.splitext(os.path.basename(self.video_path))[0]
        output_file_path = f"./tests/data/{video_prefix}_understanding.txt"
        with open(output_file_path, "w", encoding="utf-8") as file:
            file.write(output_text)

        logger.info(f"video understanding saved to {output_file_path}")