From 5b7c012c6088c9912aef3c9d661eb778eccb8939 Mon Sep 17 00:00:00 2001
From: amanning3390
Date: Mon, 23 Mar 2026 14:08:33 -0500
Subject: [PATCH] fix tokenizer regex issue with Mistral-based models

Add fix_mistral_regex=True to AutoTokenizer.from_pretrained to fix
incorrect regex pattern that leads to incorrect tokenization.

See: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e

Co-Authored-By: Claude Opus 4.6
---
 mlx_lm/tokenizer_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlx_lm/tokenizer_utils.py b/mlx_lm/tokenizer_utils.py
index 4e11e811d..d33f70e67 100644
--- a/mlx_lm/tokenizer_utils.py
+++ b/mlx_lm/tokenizer_utils.py
@@ -532,7 +532,7 @@ def load(
     chat_template = None
 
     tokenizer = AutoTokenizer.from_pretrained(
-        model_path, **(tokenizer_config_extra or {})
+        model_path, fix_mistral_regex=True, **(tokenizer_config_extra or {})
     )
 
     tokenizer_config = tokenizer.init_kwargs