From fb97100157dd269abb85506fa7a224af2c7b3d18 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Thu, 4 Dec 2025 15:03:47 +0100
Subject: [PATCH] TokenizersBackend (from transformers v5)

Map the "TokenizersBackend" tokenizer class name to BPETokenizer,
alongside the existing entries in TokenizerModel.

Add a test that loads mlx-community/Ministral-3-3B-Instruct-2512-4bit
through AutoTokenizer and checks the token ids produced for a short
sentence. The test unwraps the tokenizer with `try #require` so that a
failed cast is reported as a test failure instead of trapping on a
force-unwrap.
---
NOTE(review): leading whitespace inside the hunks was reconstructed
assuming 4-space indentation -- confirm against the tree, or apply with
`git apply --ignore-whitespace`. The test hunk was edited by hand, so the
index hash recorded for TokenizerTests.swift no longer describes the
resulting blob.

 Sources/Tokenizers/Tokenizer.swift         | 1 +
 Tests/TokenizersTests/TokenizerTests.swift | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index ad820e06..3b166daf 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -168,6 +168,7 @@ enum TokenizerModel {
         "LlamaTokenizer": BPETokenizer.self,
         "RobertaTokenizer": BPETokenizer.self,
         "T5Tokenizer": T5Tokenizer.self,
+        "TokenizersBackend": BPETokenizer.self,
         "PreTrainedTokenizer": BPETokenizer.self,
         "Qwen2Tokenizer": BPETokenizer.self,
         "WhisperTokenizer": BPETokenizer.self,
diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift
index b81189a8..150abd14 100644
--- a/Tests/TokenizersTests/TokenizerTests.swift
+++ b/Tests/TokenizersTests/TokenizerTests.swift
@@ -362,4 +362,12 @@ struct TokenizerTests {
         #expect(tokenizer.tokenize(text: "Who are you?") == ["", "Who", "Ġare", "Ġyou", "?", ""])
         #expect(tokenizer.encode(text: "Who are you?") == [0, 0, 12375, 32, 47, 116, 2, 2])
     }
+
+    @Test
+    func tokenizerBackend() async throws {
+        let loaded = try await AutoTokenizer.from(pretrained: "mlx-community/Ministral-3-3B-Instruct-2512-4bit")
+        let tokenizer = try #require(loaded as? PreTrainedTokenizer)
+
+        #expect(tokenizer.encode(text: "She took a train to the West") == [6284, 5244, 1261, 10018, 1317, 1278, 5046])
+    }
 }