Skip to content

Commit fecae12

Browse files
authored
Remove all_special_tokens_extended from tokenizer code (#29686)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
1 parent 8d9338f commit fecae12

File tree

6 files changed

+40
-65
lines changed

6 files changed

+40
-65
lines changed

tests/tokenization/test_cached_tokenizer.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ def _check_consistency(target: AnyTokenizer, expected: AnyTokenizer):
3131
# Cached attributes
3232
assert target.all_special_ids == expected.all_special_ids
3333
assert target.all_special_tokens == expected.all_special_tokens
34-
assert target.all_special_tokens_extended == expected.all_special_tokens_extended
3534
assert target.get_vocab() == expected.get_vocab()
3635
assert len(target) == len(expected)
3736

tests/tokenization/test_mistral_tokenizer.py

Lines changed: 40 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -258,52 +258,46 @@ def mistral_tokenizer(request) -> MistralTokenizer:
258258
)
259259
class TestMistralTokenizer:
260260
def test_all_special_tokens(self, mistral_tokenizer: MistralTokenizer):
261-
attributes = [
262-
mistral_tokenizer.all_special_tokens,
263-
mistral_tokenizer.all_special_tokens_extended,
264-
]
265-
266-
for attribute in attributes:
267-
if mistral_tokenizer.is_tekken:
268-
assert attribute == [
269-
"<unk>",
270-
"<s>",
271-
"</s>",
272-
"[INST]",
273-
"[/INST]",
274-
"[AVAILABLE_TOOLS]",
275-
"[/AVAILABLE_TOOLS]",
276-
"[TOOL_RESULTS]",
277-
"[/TOOL_RESULTS]",
278-
"[TOOL_CALLS]",
279-
"[IMG]",
280-
"<pad>",
281-
"[IMG_BREAK]",
282-
"[IMG_END]",
283-
"[PREFIX]",
284-
"[MIDDLE]",
285-
"[SUFFIX]",
286-
"[SYSTEM_PROMPT]",
287-
"[/SYSTEM_PROMPT]",
288-
"[TOOL_CONTENT]",
289-
] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
290-
"[ARGS]",
291-
"[CALL_ID]",
292-
"[THINK]",
293-
"[/THINK]",
294-
] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
295-
else:
296-
assert attribute == [
297-
"<s>",
298-
"</s>",
299-
"[INST]",
300-
"[/INST]",
301-
"[TOOL_CALLS]",
302-
"[AVAILABLE_TOOLS]",
303-
"[/AVAILABLE_TOOLS]",
304-
"[TOOL_RESULTS]",
305-
"[/TOOL_RESULTS]",
306-
] + [f"[control_{i}]" for i in range(8, 769)]
261+
if mistral_tokenizer.is_tekken:
262+
assert mistral_tokenizer.all_special_tokens == [
263+
"<unk>",
264+
"<s>",
265+
"</s>",
266+
"[INST]",
267+
"[/INST]",
268+
"[AVAILABLE_TOOLS]",
269+
"[/AVAILABLE_TOOLS]",
270+
"[TOOL_RESULTS]",
271+
"[/TOOL_RESULTS]",
272+
"[TOOL_CALLS]",
273+
"[IMG]",
274+
"<pad>",
275+
"[IMG_BREAK]",
276+
"[IMG_END]",
277+
"[PREFIX]",
278+
"[MIDDLE]",
279+
"[SUFFIX]",
280+
"[SYSTEM_PROMPT]",
281+
"[/SYSTEM_PROMPT]",
282+
"[TOOL_CONTENT]",
283+
] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
284+
"[ARGS]",
285+
"[CALL_ID]",
286+
"[THINK]",
287+
"[/THINK]",
288+
] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
289+
else:
290+
assert mistral_tokenizer.all_special_tokens == [
291+
"<s>",
292+
"</s>",
293+
"[INST]",
294+
"[/INST]",
295+
"[TOOL_CALLS]",
296+
"[AVAILABLE_TOOLS]",
297+
"[/AVAILABLE_TOOLS]",
298+
"[TOOL_RESULTS]",
299+
"[/TOOL_RESULTS]",
300+
] + [f"[control_{i}]" for i in range(8, 769)]
307301

308302
def get_vocab(self, mistral_tokenizer: MistralTokenizer):
309303
assert (

tests/tokenization/test_tokenizer_registry.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,6 @@ class TestTokenizer(TokenizerBase):
1515
def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
1616
return TestTokenizer()
1717

18-
@property
19-
def all_special_tokens_extended(self) -> list[str]:
20-
raise NotImplementedError()
21-
2218
@property
2319
def all_special_tokens(self) -> list[str]:
2420
raise NotImplementedError()

vllm/transformers_utils/tokenizer.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
9696

9797
tokenizer_all_special_ids = tokenizer.all_special_ids
9898
tokenizer_all_special_tokens = tokenizer.all_special_tokens
99-
tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended
10099
tokenizer_vocab = tokenizer.get_vocab()
101100
tokenizer_len = len(tokenizer)
102101

@@ -118,10 +117,6 @@ def all_special_ids(self) -> list[int]:
118117
def all_special_tokens(self) -> list[str]:
119118
return tokenizer_all_special_tokens
120119

121-
@property
122-
def all_special_tokens_extended(self) -> list[str]:
123-
return tokenizer_all_special_tokens_extended
124-
125120
@property
126121
def max_token_id(self) -> int:
127122
return max_token_id

vllm/transformers_utils/tokenizer_base.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,6 @@
1010

1111

1212
class TokenizerBase(ABC):
13-
@property
14-
@abstractmethod
15-
def all_special_tokens_extended(self) -> list[str]:
16-
raise NotImplementedError()
17-
1813
@property
1914
@abstractmethod
2015
def all_special_tokens(self) -> list[str]:

vllm/transformers_utils/tokenizers/mistral.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -254,10 +254,6 @@ def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]:
254254

255255
# the following attributes are set to fit vLLM's design and are used
256256
# by the structured output backends.
257-
@property
258-
def all_special_tokens_extended(self) -> list[str]:
259-
return self.all_special_tokens
260-
261257
@property
262258
def all_special_tokens(self) -> list[str]:
263259
return self._special_tokens

0 commit comments

Comments (0)