From 5cdbdb11ddc9b3f88efd563ed7c6ae51cb12e505 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 19 Nov 2025 14:11:33 -0800
Subject: [PATCH 1/4] [BugFix] Fix malformed output with streaming responses API

Signed-off-by: Nick Hill
---
 vllm/entrypoints/context.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 7a41c668d764..4dbb596e3f01 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -214,7 +214,6 @@ def _update_num_reasoning_tokens(self):
 
     def append_output(self, output: RequestOutput) -> None:
         output_token_ids = output.outputs[0].token_ids
-        self.parser = get_streamable_parser_for_assistant()
         for token_id in output_token_ids:
             self.parser.process(token_id)
         # Check if the current token is part of reasoning content
@@ -519,7 +518,8 @@ def append_output(self, output: RequestOutput) -> None:
         # (finished=True), then the next token processed will mark the
         # beginning of a new message
         self.first_tok_of_message = output.finished
-        for tok in output.outputs[0].token_ids:
+        token_ids = output.outputs[0].token_ids
+        for tok in token_ids:
             self.parser.process(tok)
         self._update_decode_token_usage(output)
 
@@ -529,7 +529,9 @@ def append_output(self, output: RequestOutput) -> None:
             self.current_turn_metrics.reset()
         # Check if the current token is part of reasoning content
         self._update_num_reasoning_tokens()
-        self.last_tok = tok
+        # Only update last_tok if we actually processed tokens
+        if token_ids:
+            self.last_tok = tok
         if len(self._messages) - self.num_init_messages < len(self.parser.messages):
             self._messages.extend(
                 self.parser.messages[len(self._messages) - self.num_init_messages :]
@@ -547,7 +549,8 @@ def append_tool_output(self, output: list[Message]) -> None:
             for tok in toks:
                 self.parser.process(tok)
             self.last_tok = toks[-1]
-        # TODO: add tool_output messages to self._messages
+        # Add tool output messages to self._messages
+        self._messages.extend(output)
 
     def is_expecting_start(self) -> bool:
         return self.parser.state == StreamState.EXPECT_START

From f04f01b2f4eb00ebfdeaffb21c8cf8d29d6409b4 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 19 Nov 2025 17:03:40 -0800
Subject: [PATCH 2/4] Fix multi-turn MCP

Signed-off-by: Nick Hill
---
 vllm/entrypoints/context.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 4dbb596e3f01..d0616e78b335 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -214,6 +214,9 @@ def _update_num_reasoning_tokens(self):
 
     def append_output(self, output: RequestOutput) -> None:
         output_token_ids = output.outputs[0].token_ids
+        # Reset parser for each append_output call to handle multi-turn scenarios
+        # where the parser needs to start fresh for each assistant response
+        self.parser = get_streamable_parser_for_assistant()
         for token_id in output_token_ids:
             self.parser.process(token_id)
         # Check if the current token is part of reasoning content
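Note on patches 1-2: patch 2 effectively reverts the first hunk of patch 1.
The comment added in patch 2 documents the invariant: this append_output
receives one complete assistant response per call, so the parser must start
fresh each time or state left over from the previous turn leaks into the
next one (the multi-turn MCP breakage). A minimal sketch of that reset
pattern, with ToyParser and ToyContext as illustrative stand-ins for the
real harmony parser and context classes:

    class ToyParser:
        """Illustrative stand-in for the harmony streamable parser."""

        def __init__(self) -> None:
            self.tokens: list[int] = []

        def process(self, tok: int) -> None:
            self.tokens.append(tok)

    class ToyContext:
        """Illustrative stand-in for the non-streaming context."""

        def append_output(self, token_ids: list[int]) -> None:
            # Fresh parser per complete response: without this reset, the
            # tokens buffered from turn N are still present when turn N+1
            # arrives, yielding malformed parsed messages.
            self.parser = ToyParser()
            for tok in token_ids:
                self.parser.process(tok)

    ctx = ToyContext()
    ctx.append_output([1, 2, 3])  # turn 1
    ctx.append_output([4, 5])     # turn 2
    assert ctx.parser.tokens == [4, 5]  # only the new turn's tokens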
From 4d59d5e31b5cd1d87dd31aff39edddcb2da4dfe2 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 19 Nov 2025 20:00:18 -0800
Subject: [PATCH 3/4] Add tool output messages from the parser to _messages

Signed-off-by: Nick Hill
---
 vllm/entrypoints/context.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index d0616e78b335..615a2a3b5030 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -552,8 +552,11 @@ def append_tool_output(self, output: list[Message]) -> None:
             for tok in toks:
                 self.parser.process(tok)
             self.last_tok = toks[-1]
-        # Add tool output messages to self._messages
-        self._messages.extend(output)
+        # Add tool output messages from the parser to self._messages
+        # (same pattern as append_output)
+        msg_count = len(self._messages) - self.num_init_messages
+        if msg_count < len(self.parser.messages):
+            self._messages.extend(self.parser.messages[msg_count:])
 
     def is_expecting_start(self) -> bool:
         return self.parser.state == StreamState.EXPECT_START

From 9ae585f248565524bf90012df3cbe3b788de8b6c Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 19 Nov 2025 22:00:13 -0800
Subject: [PATCH 4/4] Track processed token count instead of searching for last_tok

Signed-off-by: Nick Hill
---
 vllm/entrypoints/context.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 615a2a3b5030..e907d6f8a6d8 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -506,6 +506,8 @@ def __init__(self, *args, **kwargs):
         self.encoding = get_encoding()
         self.last_tok = None
         self.first_tok_of_message = True
+        # Track how many tokens have been processed, to avoid a fragile token search
+        self.processed_token_count = 0
 
     @property
     def messages(self) -> list:
@@ -524,6 +526,7 @@ def append_output(self, output: RequestOutput) -> None:
         token_ids = output.outputs[0].token_ids
         for tok in token_ids:
             self.parser.process(tok)
+            self.processed_token_count += 1
         self._update_decode_token_usage(output)
 
         # For streaming, update previous turn when message is complete
@@ -551,6 +554,7 @@ def append_tool_output(self, output: list[Message]) -> None:
             toks = self.encoding.render(msg)
             for tok in toks:
                 self.parser.process(tok)
+                self.processed_token_count += 1
             self.last_tok = toks[-1]
         # Add tool output messages from the parser to self._messages
         # (same pattern as append_output)
@@ -565,17 +569,15 @@ def is_assistant_action_turn(self) -> bool:
         return self.last_tok in self.encoding.stop_tokens_for_assistant_actions()
 
     def render_for_completion(self) -> list[int]:
-        # now this list of tokens as next turn's starting tokens
-        # `<|start|>assistant`,
-        # we need to process them in parser.
+        # Render all messages, including the new turn's starting tokens,
+        # e.g. [...previous tokens...] [<|start|>] [assistant]
        rendered_tokens = super().render_for_completion()
-        last_n = -1
-        to_process = []
-        while rendered_tokens[last_n] != self.last_tok:
-            to_process.append(rendered_tokens[last_n])
-            last_n -= 1
-        for tok in reversed(to_process):
+        # Process only the new tokens the parser hasn't seen before; this
+        # avoids a fragile backwards search that can match at the wrong position
+        to_process = rendered_tokens[self.processed_token_count :]
+        for tok in to_process:
             self.parser.process(tok)
+            self.processed_token_count += 1
         return rendered_tokens
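Note on patch 4: the removed code walked backwards from the end of
rendered_tokens until it found a token equal to last_tok. That is a
value-based search: it stops at the last occurrence of the value, which is
the wrong position whenever the same token id appears again among the newly
rendered tokens. Tracking processed_token_count makes the cut position-based
instead. A self-contained sketch of the failure mode and the fix (token
values are made up for illustration):

    # Previously-processed tokens [1, 2, 7]; newly rendered tokens [9, 7, 8].
    rendered_tokens = [1, 2, 7, 9, 7, 8]
    last_tok = 7                # final token of the previous turn
    processed_token_count = 3   # tokens already fed to the parser

    # Old, value-based search: scans backwards and stops at the LAST
    # occurrence of 7, which here is one of the NEW tokens.
    last_n = -1
    to_process = []
    while rendered_tokens[last_n] != last_tok:
        to_process.append(rendered_tokens[last_n])
        last_n -= 1
    assert list(reversed(to_process)) == [8]  # 9 and 7 are wrongly skipped

    # New, position-based slice: exactly the unseen suffix.
    assert rendered_tokens[processed_token_count:] == [9, 7, 8]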