Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from openlrc import LRCer

if __name__ == '__main__':
    # Build the transcriber/translator. base_url_config routes OpenAI API
    # traffic through a proxy endpoint instead of api.openai.com.
    lrcer = LRCer(
        chatbot_model='gpt-4o',
        base_url_config={'openai': 'https://api.gptsapi.net/v1'},
        # device='cpu',
        # compute_type='float32',
        # is_force_glossary_used=True,
    )

    # Transcribe and translate; additionally sample one frame every 5 seconds
    # for video understanding (see openlrc/video_bot.py).
    # NOTE(review): video_understanding=True with an .mp3 input looks odd --
    # frame sampling needs a video file; confirm the intended test asset.
    lrcer.run(['./data/test.mp3'], target_lang='zh-cn', skip_trans=False,
              video_understanding=True, sampling_frequency=5)
    # Generates translated ./data/test_audio.lrc and ./data/test_video.srt

    # Use a glossary to improve translation:
    # lrcer = LRCer(glossary='/data/aoe4-glossary.yaml')

    # To skip the translation process:
    # lrcer.run('./data/test.mp3', target_lang='en', skip_trans=True)

    # Change asr_options or vad_options; check openlrc.defaults for details:
    # vad_options = {"threshold": 0.1}
    # lrcer = LRCer(vad_options=vad_options)
    # lrcer.run('./data/test.mp3', target_lang='zh-cn')

    # Enhance the audio using noise suppression (consumes more time):
    # lrcer.run('./data/test.mp3', target_lang='zh-cn', noise_suppress=True)

    # Change the LLM model for translation:
    # lrcer = LRCer(chatbot_model='claude-3-sonnet-20240229')
    # lrcer.run('./data/test.mp3', target_lang='zh-cn')

    # Clear the temp folder after processing is done:
    # lrcer.run('./data/test.mp3', target_lang='zh-cn', clear_temp=True)

    # Change base_url:
    # lrcer = LRCer(base_url_config={'openai': 'https://api.g4f.icu/v1',
    #                                'anthropic': 'https://example/api'})

    # Route model to an arbitrary Chatbot SDK:
    # lrcer = LRCer(chatbot_model='openai: claude-3-sonnet-20240229',
    #               base_url_config={'openai': 'https://api.g4f.icu/v1/'})

    # Bilingual subtitle:
    # lrcer.run('./data/test.mp3', target_lang='zh-cn', bilingual_sub=True)
19 changes: 18 additions & 1 deletion openlrc/openlrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from openlrc.utils import Timer, extend_filename, get_audio_duration, format_timestamp, extract_audio, \
get_file_type

from openlrc.video_bot import videoBot


class LRCer:
"""
Expand Down Expand Up @@ -86,6 +88,7 @@ def __init__(self, whisper_model: str = 'large-v3', compute_type: str = 'float16
self.transcriber = Transcriber(model_name=whisper_model, compute_type=compute_type, device=device,
asr_options=self.asr_options, vad_options=self.vad_options)
self.transcribed_paths = []
self.transcribed_origin_paths = []

@staticmethod
def parse_glossary(glossary: Union[dict, str, Path]):
Expand Down Expand Up @@ -119,6 +122,7 @@ def produce_transcriptions(self, transcription_queue, audio_paths, src_lang):
"""
for audio_path in audio_paths:
transcribed_path = extend_filename(audio_path, '_transcribed').with_suffix('.json')
self.transcribed_origin_paths.append(transcribed_path)
if not transcribed_path.exists():
with Timer('Transcription process'):
logger.info(
Expand Down Expand Up @@ -218,6 +222,8 @@ def handle_bilingual_subtitles(transcribed_path, base_name, transcribed_opt_sub,
logger.debug('Translation worker waiting transcription...')
transcribed_path = transcription_queue.get()

logger.info(f'Got transcription: {transcribed_path}')

if transcribed_path is None:
transcription_queue.put(None)
logger.debug('Translation worker finished.')
Expand Down Expand Up @@ -298,7 +304,7 @@ def _translate(self, audio_name, target_lang, transcribed_opt_sub, translated_pa
return final_subtitle

def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optional[str] = None, target_lang='zh-cn',
skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp=False) -> List[str]:
skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp=False,video_understanding=False,sampling_frequency=0) -> List[str]:
"""
Run the entire transcription and translation process.

Expand Down Expand Up @@ -357,6 +363,8 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona
if isinstance(paths, str) or isinstance(paths, Path):
paths = [paths]

video_path = paths

paths = list(map(Path, paths))

audio_paths = self.pre_process(paths, noise_suppress=noise_suppress)
Expand Down Expand Up @@ -384,6 +392,15 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona
logger.info('Clearing temporary folder...')
self.clear_temp_files(audio_paths)



if video_understanding:
assert sampling_frequency > 0, f"sampling_frequency should be greater than 0"
assert len(video_path) == len(self.transcribed_origin_paths), f"the number of video files and transcribed files do not match"
for i in range(len(video_path)):
Bot = videoBot(video_path[i],self.transcribed_origin_paths[i],sampling_frequency)
Bot.inference()

return self.transcribed_paths

def clear_temp_files(self, paths):
Expand Down
103 changes: 103 additions & 0 deletions openlrc/video_bot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import base64
from PIL import Image
import numpy as np
from decord import VideoReader, cpu
from openlrc.logger import logger
import anthropic
import os
import json

os.environ['OPENAI_API_KEY'] = "sk-OgF4d947838dcea5e04954eb578374b93715da50b196XXT9"

class videoBot:
    """Generate a natural-language description of a video with Claude.

    Frames are sampled from the video at a fixed interval (one frame every
    ``num_frequence`` seconds), combined with the transcribed subtitle text
    read from a transcription JSON file, and sent to an Anthropic-compatible
    Messages API endpoint. The model's answer is written to
    ``./tests/data/<video name>_understanding.txt``.
    """

    def __init__(self, video_path, text_path, num_frequence=5):
        """
        Args:
            video_path: Path to the video file to sample frames from.
            text_path: Path to the transcription JSON (expects a top-level
                ``segments`` list whose items each carry a ``text`` field).
            num_frequence: Sampling interval in seconds between frames.
        """
        self.video_path = video_path
        self.text_path = text_path
        self.num_frequence = num_frequence

    def encode_image_base64(self, img):
        """Return *img* (a ``PIL.Image.Image``) encoded as a base64 JPEG string."""
        from io import BytesIO
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def get_index(self, fps, max_frame):
        """Return frame indices sampled every ``num_frequence`` seconds.

        The step is clamped to at least 1 so that very low frame rates cannot
        produce ``range(..., step=0)``, which would raise ``ValueError``.
        """
        step = max(1, int(self.num_frequence * fps))
        return list(range(0, max_frame + 1, step))

    def load_video_frames(self, video_path, bound=None):
        """Decode the sampled frames from *video_path* as RGB PIL images.

        ``bound`` is currently unused; kept for interface compatibility.
        """
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())
        indices = self.get_index(fps, max_frame)
        return [Image.fromarray(vr[idx].asnumpy()).convert('RGB') for idx in indices]

    def inference(self):
        """Query Claude with the sampled frames plus subtitle text and save the answer.

        Raises:
            KeyError: if ``OPENAI_API_KEY`` is not set in the environment.
        """
        logger.info("start understanding")
        # NOTE(review): the env-var name says OpenAI but the key is used with
        # an Anthropic-compatible proxy endpoint -- confirm this is intended.
        client = anthropic.Anthropic(
            base_url="https://api.gptsapi.net",
            api_key=os.environ['OPENAI_API_KEY']
        )

        imgs = self.load_video_frames(self.video_path)

        # Build the multimodal request content: one image block per frame.
        content_blocks = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": self.encode_image_base64(img),
                },
            }
            for img in imgs
        ]

        # Concatenate all transcribed segment texts into one subtitle string.
        with open(self.text_path, "r", encoding="utf-8") as json_file:
            json_data = json.load(json_file)
        combined_text = " ".join(segment["text"] for segment in json_data.get("segments", []))

        description_text = f'''the following is the subtitle of the video: {combined_text},
        please give a detailed description of the video content based on the subtitle and the image.'''

        # Trailing text block carrying the actual question for the model.
        content_blocks.append({
            "type": "text",
            "text": description_text
        })

        response = client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=1000,
            temperature=0.5,
            messages=[{
                "role": "user",
                "content": content_blocks
            }]
        )

        output_text = response.content[0].text

        video_prefix = os.path.splitext(os.path.basename(self.video_path))[0]
        output_file_path = f"./tests/data/{video_prefix}_understanding.txt"
        with open(output_file_path, "w", encoding="utf-8") as file:
            file.write(output_text)

        logger.info(f"video understanding saved to {output_file_path}")