diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 5dc7f7859226..397e2cc84f4d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -70,7 +70,7 @@
     truncate_tool_call_ids,
     validate_request_params,
 )
-from vllm.utils import as_list
+from vllm.utils import as_list, random_uuid
 
 logger = init_logger(__name__)
 
@@ -255,10 +255,19 @@ async def create_chat_completion(
             except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
                 logger.exception("Error in preprocessing prompt inputs")
                 return self.create_error_response(f"{e} {e.__cause__}")
+
+        request_id = self._add_prefix(getattr(request, "request_id", None))
 
-        request_id = (
-            f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
-        )
+        if request.kv_transfer_params:
+            request_id = self._add_prefix(request.kv_transfer_params.get("p_side_request_id", request_id))
+
+        default = request.request_id or request_id
+        if raw_request is None:
+            req_id_head = default
+        else:
+            req_id_head = raw_request.headers.get("X-Request-ID")
+
+        raw_request_id = self._add_prefix(req_id_head) if req_id_head else request_id
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
@@ -346,7 +355,7 @@ async def create_chat_completion(
             return self.chat_completion_stream_generator(
                 request,
                 result_generator,
-                request_id,
+                raw_request_id,  # the real request ID to return to the user
                 model_name,
                 conversation,
                 tokenizer,
@@ -354,10 +363,12 @@ async def create_chat_completion(
             )
 
         try:
+            # The P side in P/D disaggregation always calls the full generator, so there is no need to pass the internal request ID to the streaming generator.
             return await self.chat_completion_full_generator(
                 request,
                 result_generator,
-                request_id,
+                raw_request_id,  # the real request ID to return to the user
+                request_id,  # the internal vLLM request ID, passed to the D side in kv_transfer_params
                 model_name,
                 conversation,
                 tokenizer,
@@ -367,6 +378,13 @@ async def create_chat_completion(
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
 
+    def _add_prefix(self, request_id, prefix="chatcmpl-"):
+        if request_id and not request_id.startswith(prefix):
+            return prefix + request_id
+        elif not request_id:
+            return prefix + random_uuid()
+        return request_id
+
     def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
         if request.add_generation_prompt:
             return self.response_role
@@ -1251,7 +1269,8 @@ async def chat_completion_full_generator(
         self,
         request: ChatCompletionRequest,
         result_generator: AsyncIterator[RequestOutput],
-        request_id: str,
+        request_id: str,  # the real request ID to return to the user
+        vllm_request_id: str,  # the internal vLLM request ID, passed to the D side in kv_transfer_params
         model_name: str,
         conversation: list[ConversationMessage],
         tokenizer: AnyTokenizer,
@@ -1271,6 +1290,10 @@ async def chat_completion_full_generator(
 
         assert final_res is not None
 
+        # Pass the P side's internal request ID to the D side in kv_transfer_params
+        if final_res.kv_transfer_params:
+            final_res.kv_transfer_params["p_side_request_id"] = vllm_request_id
+
         choices: list[ChatCompletionResponseChoice] = []
         if self.tool_call_id_type == "kimi_k2":
             history_tool_call_cnt = get_history_tool_calls_cnt(conversation)