Skip to content

Commit 67fc16c

Browse files
authored
[Bugfix] If chunked_prefill is disabled, end the scheduling early. (#28911)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
1 parent 6330f94 commit 67fc16c

File tree

3 files changed

+33
-4
lines changed

3 files changed

+33
-4
lines changed

tests/v1/core/test_scheduler.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,34 @@ def test_schedule_concurrent_batches(
641641
scheduler.update_from_output(scheduler_output1, model_runner_output)
642642

643643

644+
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
645+
def test_schedule_order(enable_chunked_prefill: bool):
646+
scheduler = create_scheduler(
647+
max_num_batched_tokens=1024,
648+
max_num_seqs=3,
649+
enable_chunked_prefill=enable_chunked_prefill,
650+
)
651+
652+
# long requests
653+
requests = create_requests(num_requests=2, num_tokens=800)
654+
# short requests
655+
requests += create_requests(num_requests=2, num_tokens=10)
656+
657+
for request in requests:
658+
scheduler.add_request(request)
659+
660+
scheduler_output1 = scheduler.schedule()
661+
662+
if enable_chunked_prefill:
663+
# When chunked prefill is enabled, long requests will be chunked.
664+
assert len(scheduler_output1.scheduled_new_reqs) == 2
665+
else:
666+
# When chunked prefill is disabled, the long requests should not be skipped
667+
# in favor of scheduling subsequent short requests in advance,
668+
# even though there is still token budget remaining.
669+
assert len(scheduler_output1.scheduled_new_reqs) == 1
670+
671+
644672
def test_preempt_during_execution():
645673
# NOTE(woosuk): The actual number of available blocks is 10 instead of 11
646674
# because block 0 is reserved as the null block.

tests/v1/core/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def create_scheduler(
4242
model: str = "facebook/opt-125m",
4343
max_num_seqs: int = 16,
4444
max_num_batched_tokens: int = 8192,
45+
enable_chunked_prefill: bool = True,
4546
enable_prefix_caching: bool = False,
4647
long_prefill_token_threshold: int = 0,
4748
disable_chunked_mm_input: bool = False,
@@ -76,7 +77,7 @@ def create_scheduler(
7677
max_model_len=max_model_len,
7778
long_prefill_token_threshold=long_prefill_token_threshold,
7879
disable_chunked_mm_input=disable_chunked_mm_input,
79-
enable_chunked_prefill=True,
80+
enable_chunked_prefill=enable_chunked_prefill,
8081
async_scheduling=async_scheduling,
8182
)
8283
model_config = ModelConfig(

vllm/v1/core/sched/scheduler.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -508,9 +508,9 @@ def schedule(self) -> SchedulerOutput:
508508
not self.scheduler_config.enable_chunked_prefill
509509
and num_new_tokens > token_budget
510510
):
511-
self.waiting.pop_request()
512-
skipped_waiting_requests.prepend_request(request)
513-
continue
511+
# If chunked_prefill is disabled,
512+
# we can stop scheduling here.
513+
break
514514

515515
num_new_tokens = min(num_new_tokens, token_budget)
516516
assert num_new_tokens > 0

0 commit comments

Comments
 (0)