From 5b7c012c6088c9912aef3c9d661eb778eccb8939 Mon Sep 17 00:00:00 2001
From: amanning3390 <adam.manning@pro-serveinc.com>
Date: Mon, 23 Mar 2026 14:08:33 -0500
Subject: [PATCH] fix tokenizer regex issue with Mistral-based models

Pass fix_mistral_regex=True to AutoTokenizer.from_pretrained to correct
an incorrect regex pattern that leads to incorrect tokenization.

See: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 mlx_lm/tokenizer_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlx_lm/tokenizer_utils.py b/mlx_lm/tokenizer_utils.py
index 4e11e811d..d33f70e67 100644
--- a/mlx_lm/tokenizer_utils.py
+++ b/mlx_lm/tokenizer_utils.py
@@ -532,7 +532,7 @@ def load(
     chat_template = None
 
     tokenizer = AutoTokenizer.from_pretrained(
-        model_path, **(tokenizer_config_extra or {})
+        model_path, fix_mistral_regex=True, **(tokenizer_config_extra or {})
     )
 
     tokenizer_config = tokenizer.init_kwargs