Patch summary (review notes; text before the first "diff --git" header is
skipped by git apply and patch):
* Move TranscriptionConfig and TranslationConfig from openlrc/openlrc.py into
  the new openlrc/config.py; openlrc/__init__.py imports them from there.
* Defer the imports of Transcriber, TranslateInfo, LLMTranslator, Preprocessor,
  torch and df.enhance from module level to the functions that use them.
* tests/test_preprocess.py: patch a df.enhance module registered in sys.modules.
  Review fix: _df_enhance is rebound to the value returned by
  sys.modules.setdefault(), so patch.object() targets the module that
  `from df.enhance import ...` resolves, even when a df.enhance module was
  already registered before this test module was imported.
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy. Blank context lines are inferred from the hunk
counts, and the tests/test_preprocess.py index hash predates the review fix.
If context fails to match, apply with `git apply --ignore-whitespace`.

diff --git a/openlrc/__init__.py b/openlrc/__init__.py
index 056d690..34b9cdb 100644
--- a/openlrc/__init__.py
+++ b/openlrc/__init__.py
@@ -1,8 +1,9 @@
 # Copyright (C) 2024. Hao Zheng
 # All rights reserved.
 
+from openlrc.config import TranscriptionConfig, TranslationConfig
 from openlrc.models import ModelConfig, ModelProvider, list_chatbot_models
-from openlrc.openlrc import LRCer, TranscriptionConfig, TranslationConfig
+from openlrc.openlrc import LRCer
 
 __all__ = ("LRCer", "TranscriptionConfig", "TranslationConfig", "ModelConfig", "list_chatbot_models", "ModelProvider")
 __version__ = "1.6.1"
diff --git a/openlrc/config.py b/openlrc/config.py
new file mode 100644
index 0000000..d2f05ff
--- /dev/null
+++ b/openlrc/config.py
@@ -0,0 +1,58 @@
+# Copyright (C) 2025. Hao Zheng
+# All rights reserved.
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from openlrc.models import ModelConfig
+
+
+@dataclass
+class TranscriptionConfig:
+    """
+    Configuration for the transcription stage.
+
+    Args:
+        whisper_model: Name of whisper model. Default: ``large-v3``
+        compute_type: Computation type (``default``, ``int8``, ``int8_float16``,
+            ``int16``, ``float16``, ``float32``). Default: ``float16``
+        device: Device for computation. Default: ``cuda``
+        asr_options: Parameters for whisper model.
+        vad_options: Parameters for VAD model.
+        preprocess_options: Options for audio preprocessing.
+    """
+
+    whisper_model: str = "large-v3"
+    compute_type: str = "float16"
+    device: str = "cuda"
+    asr_options: dict | None = None
+    vad_options: dict | None = None
+    preprocess_options: dict | None = None
+
+
+@dataclass
+class TranslationConfig:
+    """
+    Configuration for the translation stage.
+
+    Args:
+        chatbot_model: The chatbot model to use. Can be a string like
+            ``'gpt-4.1-nano'`` or ``'provider:model-name'``, or a ``ModelConfig``
+            instance. Default: ``gpt-4.1-nano``
+        fee_limit: Maximum fee per translation call in USD. Default: ``0.8``
+        consumer_thread: Number of parallel translation threads. Default: ``4``
+        proxy: Proxy for API requests. e.g. ``'http://127.0.0.1:7890'``
+        base_url_config: Base URL dict for OpenAI & Anthropic.
+        glossary: Dictionary or path mapping source words to translations.
+        retry_model: Fallback model for translation retries.
+        is_force_glossary_used: Force glossary usage in context. Default: ``False``
+    """
+
+    chatbot_model: str | ModelConfig = "gpt-4.1-nano"
+    fee_limit: float = 0.8
+    consumer_thread: int = 4
+    proxy: str | None = None
+    base_url_config: dict | None = None
+    glossary: dict | str | Path | None = None
+    retry_model: str | ModelConfig | None = None
+    is_force_glossary_used: bool = False
diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py
index 2aa7bd9..5830ab4 100644
--- a/openlrc/openlrc.py
+++ b/openlrc/openlrc.py
@@ -1,30 +1,28 @@
 # Copyright (C) 2025. Hao Zheng
 # All rights reserved.
 
+from __future__ import annotations
+
 import concurrent.futures
 import json
 import shutil
 import traceback
 import warnings
 from copy import deepcopy
-from dataclasses import dataclass
 from pathlib import Path
 from pprint import pformat
 from queue import Queue
 from threading import Lock
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
-from faster_whisper.transcribe import Segment
+if TYPE_CHECKING:
+    from faster_whisper.transcribe import Segment
 
-from openlrc.context import TranslateInfo
+from openlrc.config import TranscriptionConfig, TranslationConfig
 from openlrc.defaults import default_asr_options, default_preprocess_options, default_vad_options
 from openlrc.logger import logger
-from openlrc.models import ModelConfig
 from openlrc.opt import SubtitleOptimizer
-from openlrc.preprocess import Preprocessor
 from openlrc.subtitle import BilingualSubtitle, Subtitle
-from openlrc.transcribe import Transcriber
-from openlrc.translate import LLMTranslator
 from openlrc.utils import (
     Timer,
     extend_filename,
@@ -38,57 +36,6 @@
 _SENTINEL = object()
 
 
-@dataclass
-class TranscriptionConfig:
-    """
-    Configuration for the transcription stage.
-
-    Args:
-        whisper_model: Name of whisper model. Default: ``large-v3``
-        compute_type: Computation type (``default``, ``int8``, ``int8_float16``,
-            ``int16``, ``float16``, ``float32``). Default: ``float16``
-        device: Device for computation. Default: ``cuda``
-        asr_options: Parameters for whisper model.
-        vad_options: Parameters for VAD model.
-        preprocess_options: Options for audio preprocessing.
-    """
-
-    whisper_model: str = "large-v3"
-    compute_type: str = "float16"
-    device: str = "cuda"
-    asr_options: dict | None = None
-    vad_options: dict | None = None
-    preprocess_options: dict | None = None
-
-
-@dataclass
-class TranslationConfig:
-    """
-    Configuration for the translation stage.
-
-    Args:
-        chatbot_model: The chatbot model to use. Can be a string like
-            ``'gpt-4.1-nano'`` or ``'provider:model-name'``, or a ``ModelConfig``
-            instance. Default: ``gpt-4.1-nano``
-        fee_limit: Maximum fee per translation call in USD. Default: ``0.8``
-        consumer_thread: Number of parallel translation threads. Default: ``4``
-        proxy: Proxy for API requests. e.g. ``'http://127.0.0.1:7890'``
-        base_url_config: Base URL dict for OpenAI & Anthropic.
-        glossary: Dictionary or path mapping source words to translations.
-        retry_model: Fallback model for translation retries.
-        is_force_glossary_used: Force glossary usage in context. Default: ``False``
-    """
-
-    chatbot_model: str | ModelConfig = "gpt-4.1-nano"
-    fee_limit: float = 0.8
-    consumer_thread: int = 4
-    proxy: str | None = None
-    base_url_config: dict | None = None
-    glossary: dict | str | Path | None = None
-    retry_model: str | ModelConfig | None = None
-    is_force_glossary_used: bool = False
-
-
 class LRCer:
     """
     Orchestrator for audio/video transcription and translation.
@@ -220,9 +167,11 @@ def __init__(
         self.transcribed_paths = []
 
     @property
-    def transcriber(self) -> Transcriber:
+    def transcriber(self):
         """Lazily initialize and return the Transcriber instance (thread-safe)."""
         if self._transcriber is None:
+            from openlrc.transcribe import Transcriber
+
             with self._transcriber_lock:
                 if self._transcriber is None:
                     self._transcriber = Transcriber(
@@ -567,6 +516,9 @@ def _translate(self, audio_name, target_lang, transcribed_opt_sub, translated_pa
         This method handles the translation process, including context preparation,
         actual translation, and post-processing of the translated subtitles.
         """
+        from openlrc.context import TranslateInfo
+        from openlrc.translate import LLMTranslator
+
         context = TranslateInfo(
             title=audio_name, audio_type="Movie", glossary=self.glossary, forced_glossary=self.is_force_glossary_used
         )
@@ -810,6 +762,8 @@ def pre_process(self, paths, noise_suppress=False):
                 extract_audio(path)
                 paths[i] = audio_path
 
+        from openlrc.preprocess import Preprocessor
+
         return Preprocessor(paths, options=self.preprocess_options).run(noise_suppress)
 
     @staticmethod
diff --git a/openlrc/preprocess.py b/openlrc/preprocess.py
index e9c749c..c299cad 100644
--- a/openlrc/preprocess.py
+++ b/openlrc/preprocess.py
@@ -4,8 +4,6 @@
 from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
 
-import torch
-from df.enhance import enhance, init_df, load_audio, save_audio
 from ffmpeg_normalize import FFmpegNormalize
 from tqdm import tqdm
 
@@ -63,6 +61,9 @@ def noise_suppression(self, audio_paths: list[Path], atten_lim_db: int = 15):
         if not audio_paths:
             return []
 
+        import torch
+        from df.enhance import enhance, init_df, load_audio, save_audio
+
         if "atten_lim_db" in self.options:
             atten_lim_db = self.options["atten_lim_db"]
 
diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py
index 67846e7..891607a 100644
--- a/tests/test_preprocess.py
+++ b/tests/test_preprocess.py
@@ -1,6 +1,8 @@
 # Copyright (C) 2024. Hao Zheng
 # All rights reserved.
 import shutil
+import sys
+import types
 import unittest
 from pathlib import Path
 from unittest.mock import Mock, patch
@@ -9,16 +11,30 @@
 
 from openlrc.preprocess import Preprocessor
 
+# Python 3.10's unittest.mock.patch has a bug where @patch("df.enhance.enhance")
+# caches the function object `enhance` as the resolved `df.enhance`, causing
+# subsequent @patch("df.enhance.save_audio") to look up `save_audio` on the
+# function instead of the module (AttributeError). Fixed in Python 3.11+.
+# Workaround: register a fake df.enhance in sys.modules (keeping any module
+# already registered there) and patch that module object via patch.object().
+_df_enhance = types.ModuleType("df.enhance")
+_df_enhance.enhance = lambda *a, **kw: None  # type: ignore[attr-defined]
+_df_enhance.init_df = lambda *a, **kw: None  # type: ignore[attr-defined]
+_df_enhance.load_audio = lambda *a, **kw: None  # type: ignore[attr-defined]
+_df_enhance.save_audio = lambda *a, **kw: None  # type: ignore[attr-defined]
+sys.modules.setdefault("df", types.ModuleType("df"))
+_df_enhance = sys.modules.setdefault("df.enhance", _df_enhance)
+
 
 class TestPreprocessor(unittest.TestCase):
     def tearDown(self) -> None:
         preprocessed_path = Path("data/preprocessed")
         shutil.rmtree(preprocessed_path, ignore_errors=True)
 
-    @patch("openlrc.preprocess.enhance")
-    @patch("openlrc.preprocess.init_df")
-    @patch("openlrc.preprocess.load_audio")
-    @patch("openlrc.preprocess.save_audio")
+    @patch.object(_df_enhance, "enhance")
+    @patch.object(_df_enhance, "init_df")
+    @patch.object(_df_enhance, "load_audio")
+    @patch.object(_df_enhance, "save_audio")
     @patch("openlrc.preprocess.release_memory")
     def test_noise_suppression_returns_path_objects(
         self, mock_release_memory, mock_save_audio, mock_load_audio, mock_init_df, mock_enhance