From 18436d1cb1c7b75b90568f0832ca5d4ce61ffae4 Mon Sep 17 00:00:00 2001
From: Yao Yunxiang
Date: Wed, 15 Oct 2025 14:08:15 +0800
Subject: [PATCH 1/3] force vllm to rewrite request id in chat api, to avoid
 collisions

modify the code to make it clearer and safer
---
 vllm/entrypoints/openai/serving_chat.py | 35 ++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 5dc7f7859226..61a07e69ac1f 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -70,7 +70,7 @@
     truncate_tool_call_ids,
     validate_request_params,
 )
-from vllm.utils import as_list
+from vllm.utils import as_list, random_uuid
 
 logger = init_logger(__name__)
 
@@ -256,9 +256,18 @@ async def create_chat_completion(
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")
 
-        request_id = (
-            f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
-        )
+        request_id = self._ensure_prefix(request.request_id or random_uuid())
+
+        if request.kv_transfer_params:
+            request_id = self._ensure_prefix(request.kv_transfer_params.get("p_side_request_id", request_id))
+
+        default = request.request_id or request_id
+        if raw_request is None:
+            req_id_head = default
+        else:
+            req_id_head = raw_request.headers.get("X-Request-ID")
+
+        raw_request_id = self._ensure_prefix(req_id_head) if req_id_head else request_id
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
@@ -346,7 +355,7 @@ async def create_chat_completion(
             return self.chat_completion_stream_generator(
                 request,
                 result_generator,
-                request_id,
+                raw_request_id,  # the real request ID to return to the user
                 model_name,
                 conversation,
                 tokenizer,
@@ -354,10 +363,12 @@ async def create_chat_completion(
             )
 
         try:
+            # The P side in P/D separation always calls the full generator; no need to pass the internal request id to the streaming generator.
             return await self.chat_completion_full_generator(
                 request,
                 result_generator,
-                request_id,
+                raw_request_id,  # the real request ID to return to the user
+                request_id,  # the internal vLLM request ID, passed to the D side in kv_transfer_params
                 model_name,
                 conversation,
                 tokenizer,
@@ -367,6 +378,11 @@ async def create_chat_completion(
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
 
+    def _ensure_prefix(self, rid, prefix="chatcmpl-"):
+        if rid and not rid.startswith(prefix):
+            return prefix + rid
+        return rid
+
     def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
         if request.add_generation_prompt:
             return self.response_role
@@ -1251,7 +1267,8 @@ async def chat_completion_full_generator(
         self,
         request: ChatCompletionRequest,
         result_generator: AsyncIterator[RequestOutput],
-        request_id: str,
+        request_id: str,  # the real request ID to return to the user
+        vllm_request_id: str,  # the internal vLLM request ID, passed to the D side in kv_transfer_params
         model_name: str,
         conversation: list[ConversationMessage],
         tokenizer: AnyTokenizer,
@@ -1271,6 +1288,10 @@ async def chat_completion_full_generator(
 
         assert final_res is not None
 
+        # Pass the internal P-side request id to the D side in kv_transfer_params
+        if final_res.kv_transfer_params:
+            final_res.kv_transfer_params["p_side_request_id"] = vllm_request_id
+
         choices: list[ChatCompletionResponseChoice] = []
         if self.tool_call_id_type == "kimi_k2":
             history_tool_call_cnt = get_history_tool_calls_cnt(conversation)
From 8e5fb77c694581a4c7ddd07a97ebc228808f2758 Mon Sep 17 00:00:00 2001
From: Yao Yunxiang
Date: Wed, 15 Oct 2025 15:15:36 +0800
Subject: [PATCH 2/3] fix the code logic

---
 vllm/entrypoints/openai/serving_chat.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 61a07e69ac1f..2919856bef6b 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -256,10 +256,10 @@ async def create_chat_completion(
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")
 
-        request_id = self._ensure_prefix(request.request_id or random_uuid())
+        request_id = self._add_prefix(request.request_id or random_uuid())
 
         if request.kv_transfer_params:
-            request_id = self._ensure_prefix(request.kv_transfer_params.get("p_side_request_id", request_id))
+            request_id = self._add_prefix(request.kv_transfer_params.get("p_side_request_id", request_id))
 
         default = request.request_id or request_id
         if raw_request is None:
@@ -267,7 +267,7 @@ async def create_chat_completion(
         else:
            req_id_head = raw_request.headers.get("X-Request-ID")
 
-        raw_request_id = self._ensure_prefix(req_id_head) if req_id_head else request_id
+        raw_request_id = self._add_prefix(req_id_head) if req_id_head else request_id
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
@@ -378,10 +378,12 @@ async def create_chat_completion(
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
 
-    def _ensure_prefix(self, rid, prefix="chatcmpl-"):
-        if rid and not rid.startswith(prefix):
-            return prefix + rid
-        return rid
+    def _add_prefix(self, request_id, prefix="chatcmpl-"):
+        if request_id and not request_id.startswith(prefix):
+            return prefix + request_id
+        elif not request_id:
+            return prefix + random_uuid()
+        return request_id
 
     def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
         if request.add_generation_prompt:

From 592a02910a053788aa981572ef5080b6153f5dd7 Mon Sep 17 00:00:00 2001
From: Yao Yunxiang
Date: Wed, 15 Oct 2025 15:24:07 +0800
Subject: [PATCH 3/3] fix the code logic

---
 vllm/entrypoints/openai/serving_chat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 2919856bef6b..397e2cc84f4d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -255,8 +255,8 @@ async def create_chat_completion(
         except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")
-
-        request_id = self._add_prefix(request.request_id or random_uuid())
+
+        request_id = self._add_prefix(getattr(request, "request_id", None))
 
         if request.kv_transfer_params:
             request_id = self._add_prefix(request.kv_transfer_params.get("p_side_request_id", request_id))
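
Reviewer note (not part of the patches): below is a minimal, runnable sketch of the request-id normalization that these three patches converge on. It mirrors the final _add_prefix behavior (prepend the "chatcmpl-" prefix when it is missing, mint a fresh id when none is supplied, pass through ids that already carry the prefix). The surrounding vLLM serving classes and the P/D kv_transfer_params plumbing are intentionally not reproduced, and uuid4 stands in here for vllm.utils.random_uuid.

# Illustrative sketch only; not part of the patch series.
import uuid
from typing import Optional


def add_prefix(request_id: Optional[str], prefix: str = "chatcmpl-") -> str:
    """Normalize a request id the way _add_prefix does in the final patch."""
    if request_id and not request_id.startswith(prefix):
        # User supplied an id without the prefix: prepend it.
        return prefix + request_id
    if not request_id:
        # No id supplied: mint a fresh one (uuid4 stands in for random_uuid).
        return prefix + uuid.uuid4().hex
    # Already prefixed: return unchanged.
    return request_id


if __name__ == "__main__":
    print(add_prefix("abc123"))        # chatcmpl-abc123
    print(add_prefix("chatcmpl-xyz"))  # chatcmpl-xyz (unchanged)
    print(add_prefix(None))            # chatcmpl-<random hex>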