Commit 8f8c66d
[Core] Add a random suffix to frontend-provided request IDs
Since vllm-project#9550 and vllm-project#10968 we support clients supplying a custom request ID. The motivation for this is that it can be very helpful when you need to correlate vLLM logs with the logs of a related service.

Since the request ID is used ubiquitously across vLLM as a unique key, it is clearly problematic if we ever have multiple in-flight requests using the same client-provided request ID. We saw this happening recently when `vllm bench serve` started including a request ID and the request IDs from multiple concurrent instances caused collisions. See vllm-project#27723.

We currently try to guard against request ID collisions in the frontend, in `OutputProcessor`:

```
def add_request(...):
    if request_id in self.request_states:
        raise ValueError(f"Request id {request_id} already running.")
```

However, this is not always effective:

1) We can have abort race conditions where a request is no longer tracked by the frontend but is still not completed in the engine. See vllm-project#15326 for an attempt to fix this.

2) We can have async scheduling race conditions where a request ID is removed from the output processor and is being scheduled again while the older request with that ID is still being completed by the model runner. See vllm-project#29355.

3) With P/D, a request will continue to be tracked by the prefill engine long after the prefill request has completed in the frontend, while we wait for the decode side to fetch the KV blocks. See vllm-project#20139.

Let's instead ensure we use a unique request ID internally, even when a client provides a custom request ID. We can do this simply by appending a short random suffix to any request ID provided by the frontend.

A full 32-character random UUID would be overkill as a suffix, so how many random characters would be sufficient? 8 hex characters give us 32 bits of entropy, or 16^8 possible suffixes. Using the collision probability approximation from https://preshing.com/20110504/hash-collision-probabilities, with N = 16^8 and k the number of generated suffixes, the probability of a collision is approximately (k^2)/(2N). So if a client somehow caused vLLM to hold 10k in-flight requests that all reuse the same client-provided ID, there would be a 1.16% chance of collision:

```
>>> N = 16 ** 8
>>> k = 10_000
>>> (k ** 2) / (2 * N)
0.011641532182693481
```

That seems [super good enough](https://hownot2.com/products/hownot2-super-good-enough-t-shirt).

The key changes to support this are:

1. `InputProcessor.process_inputs()` - we add some randomness to the request ID just before creating an `EngineCoreRequest` (sketched below), and store both the random "internal" request ID (as `request_id`) and the supplied "external" request ID (as `external_req_id`) in the `EngineCoreRequest`.
2. `RequestState.make_request_output()` - we ensure that `RequestOutput.request_id` continues to be the external request ID (for backwards compatibility) and add `internal_request_id`.
3. `OutputProcessor.abort_requests()` - we make `OutputProcessor` track a mapping from external request ID to internal request IDs, so `abort_requests()` can abort based on either ID.
4. `AsyncLLM` - we use `RequestOutputCollector` to track the internal request ID, so we can use the internal ID to abort an in-progress request. We also add an `internal` boolean flag to `abort()` so API users can abort based on either ID.
5. `ParentRequest` - in the case of parallel sampling, we need to track both the internal and external IDs for the later creation of the `RequestOutput` aggregating the child outputs.
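For illustration, a minimal sketch of the suffixing in point 1, assuming a hypothetical helper name (the actual change lives in `InputProcessor.process_inputs()` and may differ in detail):

```python
# Illustrative sketch only: append 8 random hex characters (32 bits of
# entropy) to whatever request ID the client supplied. The helper name is
# hypothetical, not part of vLLM.
import uuid


def make_internal_request_id(external_req_id: str) -> str:
    return f"{external_req_id}-{uuid.uuid4().hex[:8]}"


# e.g. make_internal_request_id("my-client-id") -> "my-client-id-3fa85f64"
```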
We need to ensure we track the external-to-internal request ID mapping because `abort()` will be supplied an external request ID. In the case where an external request ID maps to multiple running requests, we assume the caller wants all of those requests aborted. The caller can use `EngineCoreRequest.request_id` as the request ID if they want to be more specific.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
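For illustration, a rough sketch of that external-to-internal bookkeeping, with hypothetical names (the real tracking lives in `OutputProcessor` and `AsyncLLM`): an external ID resolves to every in-flight internal ID created under it, while an internal ID resolves only to itself.

```python
# Illustrative sketch only -- the names below are hypothetical, not vLLM's
# actual OutputProcessor implementation.
from collections import defaultdict

# external request ID -> internal request IDs currently in flight
_external_to_internal: dict[str, set[str]] = defaultdict(set)


def track(external_req_id: str, internal_req_id: str) -> None:
    _external_to_internal[external_req_id].add(internal_req_id)


def resolve_for_abort(request_id: str, internal: bool = False) -> set[str]:
    """Abort by internal ID directly, or fan out to everything under an external ID."""
    if internal:
        return {request_id}
    return set(_external_to_internal.get(request_id, ()))
```

With this shape, `abort(ids, internal=False)` can fan out to every internal request that shares a client-provided ID, which matches the behavior described above.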
1 parent 92c35ab, commit 8f8c66d

23 files changed: +317 -159 lines

tests/detokenizer/test_min_tokens.py (1 addition, 0 deletions)

```diff
@@ -35,6 +35,7 @@ def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str):
     )
     request = EngineCoreRequest(
         request_id="",
+        external_req_id="",
         prompt_token_ids=prompt_token_ids,
         mm_features=None,
         sampling_params=params,
```
tests/detokenizer/test_stop_string_while_stop_model_terminates.py (1 addition, 0 deletions)

```diff
@@ -31,6 +31,7 @@ def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
     # Keep other fields minimal for unit test purposes.
     req = EngineCoreRequest(
         request_id="test",
+        external_req_id="test-ext",
         prompt_token_ids=[],
         mm_features=None,
         sampling_params=params,
```
tests/entrypoints/openai/test_serving_chat.py (10 additions, 2 deletions)

```diff
@@ -390,7 +390,9 @@ async def _fake_process_inputs(
         trace_headers,
         priority,
     ):
-        return dict(engine_prompt), {}
+        mock_request = MagicMock()
+        mock_request.request_id = request_id
+        return mock_request, {}
 
     serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
     return serving_chat
@@ -662,7 +664,11 @@ async def test_serving_chat_data_parallel_rank_extraction():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
     mock_engine.model_config = MockModelConfig()
+
+    mock_request = MagicMock()
+    mock_request.request_id = "test-request-internal"
     mock_engine.input_processor = MagicMock()
+    mock_engine.input_processor.process_inputs.return_value = mock_request
     mock_engine.io_processor = MagicMock()
 
     # Mock the generate method to return an async generator
@@ -689,7 +695,9 @@ async def mock_generate(*args, **kwargs):
             finished=True,
         )
 
-    mock_engine.generate = AsyncMock(side_effect=mock_generate)
+    mock_engine.generate = MagicMock(
+        side_effect=lambda *args, **kwargs: mock_generate()
+    )
 
     serving_chat = _build_serving_chat(mock_engine)
 
```
tests/tokenizers_/test_detokenize.py (1 addition, 0 deletions)

```diff
@@ -62,6 +62,7 @@ def _run_incremental_decode(
     )
     request = EngineCoreRequest(
         request_id="",
+        external_req_id="",
         prompt_token_ids=prompt_token_ids,
         mm_features=None,
         sampling_params=params,
```
tests/v1/engine/test_async_llm.py (2 additions, 2 deletions)

```diff
@@ -253,7 +253,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
 
     # Use multi-abort to abort multiple requests at once
     abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
-    await engine.abort(abort_request_ids)
+    await engine.abort(abort_request_ids, internal=False)
 
     # Wait for all tasks to complete
     results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -548,7 +548,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
     await asyncio.sleep(0.5)
 
     # Abort the request
-    await engine.abort(request_id)
+    await engine.abort(request_id, internal=False)
 
     # Wait for generation to complete and return final output
     final_output = await generated
```
tests/v1/engine/test_engine_core.py (7 additions, 1 deletion)

```diff
@@ -40,10 +40,16 @@
 PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
+_REQUEST_COUNTER = 0
+
 
 def make_request() -> EngineCoreRequest:
+    global _REQUEST_COUNTER
+    _REQUEST_COUNTER += 1
+    request_id = f"request-{_REQUEST_COUNTER}"
     return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
+        external_req_id=f"{request_id}-{uuid.uuid4()}",
         prompt_token_ids=PROMPT_TOKENS,
         mm_features=None,
         sampling_params=SamplingParams(),
```
tests/v1/engine/test_engine_core_client.py (7 additions, 1 deletion)

```diff
@@ -39,15 +39,21 @@
 PROMPT = "Hello my name is Robert and I love quantization kernels"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
+_REQUEST_COUNTER = 0
+
 
 def make_request(
     params: SamplingParams, prompt_tokens_ids: list[int] | None = None
 ) -> EngineCoreRequest:
     if not prompt_tokens_ids:
         prompt_tokens_ids = PROMPT_TOKENS
 
+    global _REQUEST_COUNTER
+    _REQUEST_COUNTER += 1
+    request_id = f"request-{_REQUEST_COUNTER}"
     return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
+        external_req_id=f"{request_id}-{uuid.uuid4()}",
         prompt_token_ids=prompt_tokens_ids,
         mm_features=None,
         sampling_params=params,
```

tests/v1/engine/test_fast_incdec_prefix_err.py (1 addition, 0 deletions)

```diff
@@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
     params = SamplingParams(skip_special_tokens=True)
     request = EngineCoreRequest(
         request_id="test",
+        external_req_id="test-ext",
         prompt_token_ids=prompt_token_ids,
         mm_features=None,
         sampling_params=params,
```
