From db317c358b6b893af6e281ec80459d460fb1d874 Mon Sep 17 00:00:00 2001
From: pplkit <129310466+pplkit@users.noreply.github.com>
Date: Tue, 21 Oct 2025 19:27:26 +0530
Subject: [PATCH 1/2] feat: add language-aware sentence tokenization (#1269)

* feat: add language-aware sentence tokenization

* feat: add missing punkt languages

---------

Co-authored-by: pulkit <129310466+p1kit@users.noreply.github.com>
Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
---
 whisperx/alignment.py |  8 +++++---
 whisperx/utils.py     | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index 756d0ba70..81c475668 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -14,7 +14,7 @@
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
 from whisperx.audio import SAMPLE_RATE, load_audio
-from whisperx.utils import interpolate_nans
+from whisperx.utils import interpolate_nans, PUNKT_LANGUAGES
 from whisperx.schema import (
     AlignedTranscriptionResult,
     SingleSegment,
@@ -192,11 +192,13 @@ def align(
                 clean_wdx.append(wdx)
 
 
+        # Use the language-specific Punkt model if available; otherwise fall back to English.
+        punkt_lang = PUNKT_LANGUAGES.get(model_lang, 'english')
         try:
-            sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
+            sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle')
         except LookupError:
             nltk.download('punkt_tab', quiet=True)
-            sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
+            sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle')
         sentence_spans = list(sentence_splitter.span_tokenize(text))
 
         segment_data[sdx] = {
diff --git a/whisperx/utils.py b/whisperx/utils.py
index ada0deb98..2cf5090ac 100644
--- a/whisperx/utils.py
+++ b/whisperx/utils.py
@@ -126,6 +126,29 @@
 
 LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
 
+# Mapping of language codes to NLTK Punkt tokenizer model names
+PUNKT_LANGUAGES = {
+    'cs': 'czech',
+    'da': 'danish', 
+    'de': 'german',
+    'el': 'greek',
+    'en': 'english',
+    'es': 'spanish',
+    'et': 'estonian',
+    'fi': 'finnish',
+    'fr': 'french',
+    'it': 'italian',
+    'nl': 'dutch',
+    'no': 'norwegian',
+    'pl': 'polish',
+    'pt': 'portuguese',
+    'sl': 'slovene',
+    'sv': 'swedish',
+    'tr': 'turkish',
+    "ml": "malayalam"
+    "ru": "russian",
+}
+
 system_encoding = sys.getdefaultencoding()
 
 if system_encoding != "utf-8":

From d32ec3e3012ec4c0934f4088424c32f3f038b249 Mon Sep 17 00:00:00 2001
From: Barabazs <31799121+Barabazs@users.noreply.github.com>
Date: Tue, 21 Oct 2025 09:13:50 -0600
Subject: [PATCH 2/2] fix: add missing comma

---
 whisperx/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whisperx/utils.py b/whisperx/utils.py
index 2cf5090ac..8c997ce9b 100644
--- a/whisperx/utils.py
+++ b/whisperx/utils.py
@@ -145,7 +145,7 @@
     'sl': 'slovene',
     'sv': 'swedish',
     'tr': 'turkish',
-    "ml": "malayalam"
+    "ml": "malayalam",
     "ru": "russian",
 }