From db317c358b6b893af6e281ec80459d460fb1d874 Mon Sep 17 00:00:00 2001 From: pplkit <129310466+pplkit@users.noreply.github.com> Date: Tue, 21 Oct 2025 19:27:26 +0530 Subject: [PATCH 1/2] feat: add language-aware sentence tokenization (#1269) * feat: add language-aware sentence tokenization * feat: add missing punkt languages --------- Co-authored-by: pulkit <129310466+p1kit@users.noreply.github.com> Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com> --- whisperx/alignment.py | 8 +++++--- whisperx/utils.py | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index 756d0ba70..81c475668 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -14,7 +14,7 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor from whisperx.audio import SAMPLE_RATE, load_audio -from whisperx.utils import interpolate_nans +from whisperx.utils import interpolate_nans, PUNKT_LANGUAGES from whisperx.schema import ( AlignedTranscriptionResult, SingleSegment, @@ -192,11 +192,13 @@ def align( clean_wdx.append(wdx) + # Use language-specific Punkt model if available otherwise we fallback to English. + punkt_lang = PUNKT_LANGUAGES.get(model_lang, 'english') try: - sentence_splitter = nltk_load('tokenizers/punkt/english.pickle') + sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle') except LookupError: nltk.download('punkt_tab', quiet=True) - sentence_splitter = nltk_load('tokenizers/punkt/english.pickle') + sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle') sentence_spans = list(sentence_splitter.span_tokenize(text)) segment_data[sdx] = { diff --git a/whisperx/utils.py b/whisperx/utils.py index ada0deb98..2cf5090ac 100644 --- a/whisperx/utils.py +++ b/whisperx/utils.py @@ -126,6 +126,29 @@ LANGUAGES_WITHOUT_SPACES = ["ja", "zh"] +# Mapping of language codes to NLTK Punkt tokenizer model names +PUNKT_LANGUAGES = { + 'cs': 'czech', + 'da': 'danish', + 'de': 'german', + 'el': 'greek', + 'en': 'english', + 'es': 'spanish', + 'et': 'estonian', + 'fi': 'finnish', + 'fr': 'french', + 'it': 'italian', + 'nl': 'dutch', + 'no': 'norwegian', + 'pl': 'polish', + 'pt': 'portuguese', + 'sl': 'slovene', + 'sv': 'swedish', + 'tr': 'turkish', + "ml": "malayalam" + "ru": "russian", +} + system_encoding = sys.getdefaultencoding() if system_encoding != "utf-8": From d32ec3e3012ec4c0934f4088424c32f3f038b249 Mon Sep 17 00:00:00 2001 From: Barabazs <31799121+Barabazs@users.noreply.github.com> Date: Tue, 21 Oct 2025 09:13:50 -0600 Subject: [PATCH 2/2] fix: add missing comma --- whisperx/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whisperx/utils.py b/whisperx/utils.py index 2cf5090ac..8c997ce9b 100644 --- a/whisperx/utils.py +++ b/whisperx/utils.py @@ -145,7 +145,7 @@ 'sl': 'slovene', 'sv': 'swedish', 'tr': 'turkish', - "ml": "malayalam" + "ml": "malayalam", "ru": "russian", }