
Commit 064a690

[Core] Add a random suffix to frontend-provided request IDs
Since #9550 and #10968 we support clients supplying a custom request ID. The motivation is that it can be very helpful when you need to correlate vLLM logs with the logs of a related service.

Since the request ID is used ubiquitously across vLLM as a unique key, it is obviously problematic if we ever have multiple in-flight requests using the same client-provided request ID. We saw this happening recently when `vllm serve bench` started including a request ID and the request IDs from multiple concurrent instances caused collisions. See #27723.

We currently try to guard against request ID collisions in the frontend, in `OutputProcessor`:

```
def add_request(...):
    if request_id in self.request_states:
        raise ValueError(f"Request id {request_id} already running.")
```

However, this is not always effective:

1) We can have abort race conditions where a request is no longer tracked by the frontend but is still not completed in the engine. See #15326 for an attempt to fix this.
2) We can have async scheduling race conditions where a request ID has been removed from the output processor and a new request with that ID is being scheduled while the older request with that ID is still being completed by the model runner. See #29355.
3) With P/D, a request will continue to be tracked by the prefill engine long after the prefill request has been completed in the frontend, while we wait for the decode side to fetch the KV blocks. See #20139.

Let's instead ensure we use a unique request ID internally, even when a client provides a custom request ID. We can do this simply by appending a short random suffix to any request ID provided by the frontend.

We need to ensure we track the external->internal request ID mapping, because abort() will be supplied an external request ID. In the case where an external request ID maps to multiple running requests, we assume the caller requires all of those requests to be aborted. The caller can use `EngineCoreRequest.request_id` as the request ID if they want to be more specific.

A full 32-character random UUID would be overkill as a suffix, so how many random characters would be sufficient? 8 hex characters give us 32 bits of entropy, or 16^8 possible suffixes. Using the collision probability approximation from https://preshing.com/20110504/hash-collision-probabilities: with N = 16^8 and k the number of generated suffixes, the probability of collision is approximately (k^2)/(2N). So if a client somehow caused vLLM to hold 10k requests that reuse the same client-provided ID, there would be a 1.16% chance of collision:

```
>>> N = 16 ** 8
>>> k = 10_000
>>> (k ** 2) / (2 * N)
0.011641532182693481
```

That seems [super good enough](https://hownot2.com/products/hownot2-super-good-enough-t-shirt).

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
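Editor's note: a minimal sketch of the idea described in the commit message, assuming hypothetical names (`make_internal_request_id`, `RequestIdTracker`) that are not the actual vLLM API; the real implementation lives in `InputProcessor` and the diffs below.

```python
# Minimal sketch (hypothetical names, not the actual vLLM implementation):
# append an 8-hex-character random suffix to any client-provided request ID,
# and keep an external -> internal mapping so abort() can find every
# in-flight request that shares the same external ID.
import secrets
from collections import defaultdict


def make_internal_request_id(external_req_id: str) -> str:
    # secrets.token_hex(4) yields 8 hex characters, i.e. 32 bits of entropy.
    return f"{external_req_id}-{secrets.token_hex(4)}"


class RequestIdTracker:
    def __init__(self) -> None:
        # external request ID -> internal request IDs still in flight
        self._by_external: dict[str, set[str]] = defaultdict(set)

    def add(self, external_req_id: str) -> str:
        internal_id = make_internal_request_id(external_req_id)
        self._by_external[external_req_id].add(internal_id)
        return internal_id

    def abort(self, external_req_id: str) -> set[str]:
        # Abort all in-flight requests sharing this external ID.
        return self._by_external.pop(external_req_id, set())


if __name__ == "__main__":
    tracker = RequestIdTracker()
    a = tracker.add("client-req-1")
    b = tracker.add("client-req-1")  # same external ID, distinct internal IDs
    assert a != b
    assert tracker.abort("client-req-1") == {a, b}
```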
1 parent ec7035c commit 064a690

File tree

13 files changed: +111 -26 lines changed


tests/tokenizers_/test_detokenize.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -62,6 +62,7 @@ def _run_incremental_decode(
     )
     request = EngineCoreRequest(
         request_id="",
+        external_req_id="",
         prompt_token_ids=prompt_token_ids,
         mm_features=None,
         sampling_params=params,
```

tests/v1/engine/test_process_multi_modal_uuids.py

Lines changed: 15 additions & 5 deletions
```diff
@@ -6,6 +6,7 @@
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.multimodal import MultiModalUUIDDict
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import input_processor as input_processor_mod
 from vllm.v1.engine.input_processor import InputProcessor
@@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
         monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
     )

-    captured: dict[str, object] = {}
+    captured: dict[str, MultiModalUUIDDict] = {}

     def fake_preprocess(
         prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
@@ -196,7 +197,16 @@ def fake_preprocess(
     )

     # Expect request-id-based overrides are passed through
-    assert captured["mm_uuids"] == {
-        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
-        "video": [f"{request_id}-video-0"],
-    }
+    mm_uuids = captured["mm_uuids"]
+    assert set(mm_uuids.keys()) == {"image", "video"}
+    assert len(mm_uuids["image"]) == 2
+    assert len(mm_uuids["video"]) == 1
+    assert mm_uuids["image"][0].startswith(f"{request_id}-") and mm_uuids["image"][
+        0
+    ].endswith("-image-0")
+    assert mm_uuids["image"][1].startswith(f"{request_id}-") and mm_uuids["image"][
+        1
+    ].endswith("-image-1")
+    assert mm_uuids["video"][0].startswith(f"{request_id}-") and mm_uuids["video"][
+        0
+    ].endswith("-video-0")
```
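For context on the looser assertions above: the request-ID-based multimodal UUID overrides now embed the randomized internal request ID, so only the client-provided prefix and the `-image-N` / `-video-N` tail are stable. A rough illustration, where the middle hex segment is an assumed example of the random suffix, not the exact format:

```python
# Hypothetical example of an override the updated test accepts.
request_id = "req-42"                       # client-provided request ID
mm_uuid = f"{request_id}-1a2b3c4d-image-0"  # assumed internal-ID-based form
assert mm_uuid.startswith(f"{request_id}-")
assert mm_uuid.endswith("-image-0")
```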

tests/v1/kv_connector/unit/test_nixl_connector.py

Lines changed: 15 additions & 6 deletions
```diff
@@ -36,6 +36,7 @@
     has_kv_transfer_group,
 )
 from vllm.forward_context import ForwardContext
+from vllm.outputs import RequestOutput
 from vllm.platforms.interface import Platform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
@@ -1077,24 +1078,32 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
         0
     ].req_to_blocks

+    def req_id(outputs: list[RequestOutput]):
+        assert len(outputs) == 1
+        return outputs[0].request_id
+
     padding = "Just making this request a little longer so that we're sure "
     "we're not hitting the small-request lower bound beneath which we don't "
     "actually trigger the whole kv transfer, but rather just recompute the "
     "blocks on D."
-    _ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
+    req0_id = req_id(
+        llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
+    )

     # Request finished but not freed
-    assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks
+    assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks
     # Some other request, 0 still not freed
-    _ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
-    assert "0" in req_to_blocks
-    assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks
+    req1_id = req_id(
+        llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
+    )
+    assert req0_id in req_to_blocks
+    assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks

     # Wait for timeout and trigger another scheduler loop
     time.sleep(timeout)
     _ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
     # Request-0 times out and is cleared!
-    assert "0" not in req_to_blocks
+    assert req0_id not in req_to_blocks
     # Need to shutdown the background thread to release NIXL side channel port
     llm.llm_engine.engine_core.shutdown()

```

vllm/entrypoints/llm.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1700,15 +1700,30 @@ def _add_request(
17001700
)
17011701

17021702
self.llm_engine.add_request(
1703-
request_id,
1703+
engine_request.request_id,
17041704
engine_request,
17051705
params,
17061706
lora_request=lora_request,
17071707
tokenization_kwargs=tokenization_kwargs,
17081708
priority=priority,
17091709
prompt_text=prompt_text,
17101710
)
1711-
return request_id
1711+
return engine_request.request_id
1712+
1713+
@staticmethod
1714+
def _sort_outputs(
1715+
outputs: list[RequestOutput | PoolingRequestOutput],
1716+
) -> list[RequestOutput | PoolingRequestOutput]:
1717+
# Sort the outputs by request ID.
1718+
# This is necessary because some requests may be finished earlier than
1719+
# its previous requests.
1720+
1721+
# Extract the original request ID prefix for sorting.
1722+
# See how InputProcessor._generate_request_id() adds a random suffix
1723+
def extract_request_id_prefix(request_id: str) -> int:
1724+
return int(request_id.rsplit("-", 1)[0])
1725+
1726+
return sorted(outputs, key=lambda x: extract_request_id_prefix(x.request_id))
17121727

17131728
def _run_engine(
17141729
self, *, use_tqdm: bool | Callable[..., tqdm] = True
@@ -1756,7 +1771,5 @@ def _run_engine(
17561771

17571772
if use_tqdm:
17581773
pbar.close()
1759-
# Sort the outputs by request ID.
1760-
# This is necessary because some requests may be finished earlier than
1761-
# its previous requests.
1762-
return sorted(outputs, key=lambda x: int(x.request_id))
1774+
1775+
return self._sort_outputs(outputs)
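A rough illustration of why `_sort_outputs()` now strips the suffix before sorting: these request IDs used to be plain integers, so `int(x.request_id)` sufficed; with a random suffix appended, only the numeric prefix is comparable. The `"<counter>-<suffix>"` ID format shown here is an assumption for illustration only:

```python
# Assumed internal IDs of the form "<counter>-<random-suffix>".
def extract_request_id_prefix(request_id: str) -> int:
    return int(request_id.rsplit("-", 1)[0])


ids = ["2-9f3c1ab0", "0-4e5d6c7b", "1-0a1b2c3d"]
assert sorted(ids, key=extract_request_id_prefix) == [
    "0-4e5d6c7b",
    "1-0a1b2c3d",
    "2-9f3c1ab0",
]
```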

vllm/entrypoints/openai/serving_chat.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -341,7 +341,7 @@ async def create_chat_completion(
         generator = self.engine_client.generate(
             engine_request,
             sampling_params,
-            sub_request_id,
+            engine_request.request_id,
             lora_request=lora_request,
             trace_headers=trace_headers,
             priority=request.priority,
```

vllm/entrypoints/openai/serving_completion.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -231,7 +231,7 @@ async def create_completion(
         generator = self.engine_client.generate(
             engine_request,
             sampling_params,
-            request_id_item,
+            engine_request.request_id,
             lora_request=lora_request,
             trace_headers=trace_headers,
             priority=request.priority,
```

vllm/entrypoints/openai/serving_engine.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1260,7 +1260,7 @@ async def _generate_with_builtin_tools(
         generator = self.engine_client.generate(
             engine_request,
             sampling_params,
-            sub_request_id,
+            engine_request.request_id,
             lora_request=lora_request,
             priority=priority,
             prompt_text=prompt_text,
```

vllm/entrypoints/pooling/embed/serving.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -536,8 +536,8 @@ async def _collect_batch(
             # Non-chunked result - extract prompt_idx from request_id
             parts = result.request_id.split("-")
             try:
-                # Last part should be prompt index
-                prompt_idx = int(parts[-1])
+                # Second-to-last part should be prompt index
+                prompt_idx = int(parts[-2])
             except (ValueError, IndexError):
                 prompt_idx = result_idx  # Fallback to result_idx

```
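The switch from `parts[-1]` to `parts[-2]` follows from the random suffix now being the last `-`-separated field, which pushes the prompt index to second-to-last. A hypothetical example, where the exact request ID layout is an assumption:

```python
# Hypothetical request ID of the form "<base>-<prompt_idx>-<random_suffix>".
request_id = "embd-abc123-7-1f2e3d4c"
parts = request_id.split("-")
assert int(parts[-2]) == 7  # prompt index is now second-to-last
```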

vllm/v1/engine/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -49,6 +49,7 @@ class EngineCoreRequest(
     gc=False,
 ):  # type: ignore[call-arg]
     request_id: str
+    external_req_id: str
     prompt_token_ids: list[int] | None
     mm_features: list[MultiModalFeatureSpec] | None
     sampling_params: SamplingParams | None
```

vllm/v1/engine/async_llm.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -304,6 +304,12 @@ async def add_request(
         # Convert Input --> Request.
         if isinstance(prompt, EngineCoreRequest):
             request = prompt
+            if request_id != request.request_id:
+                logger.warning_once(
+                    "AsyncLLM.add_request() was passed a request_id parameter that "
+                    "does not match the EngineCoreRequest.request_id attribute. The "
+                    "latter will be used, and the former will be ignored."
+                )
         else:
             assert prompt_text is None
             request = self.input_processor.process_inputs(
@@ -333,7 +339,7 @@ async def add_request(
         assert isinstance(parent_params, SamplingParams)

         # Fan out child requests (for n>1).
-        parent_request = ParentRequest(request_id, parent_params)
+        parent_request = ParentRequest(request.request_id, parent_params)
         for idx in range(parent_params.n):
             request_id, child_params = parent_request.get_child_info(idx)
             child_request = request if idx == parent_params.n - 1 else copy(request)
```
