Commit 4a80ad0

[Model Runner V2] Don't use UVA buffer for prefill_len (#29713)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Parent: 4b17ce6

2 files changed: +6, -1 lines

vllm/v1/worker/gpu/model_runner.py (2 additions, 0 deletions)

@@ -410,6 +410,8 @@ def update_states(self, scheduler_output: SchedulerOutput) -> None:
                 cu_num_new_blocks[i].append(x + len(block_ids))
                 new_block_ids[i].extend(block_ids)
             overwrite.append(True)
+        if scheduler_output.scheduled_new_reqs:
+            self.req_states.prefill_len.copy_to_gpu()
 
         # Add new blocks for the existing requests.
         cached_reqs = scheduler_output.scheduled_cached_reqs
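
For context on what the new copy_to_gpu() call implies: below is a minimal sketch of a paired CPU/GPU buffer, assuming _make_buffer (referenced in the states.py hunk further down) returns something along these lines. The class name, constructor, and internals here are illustrative, not vLLM's actual implementation.

import torch

class CpuGpuBuffer:
    """Illustrative stand-in for a staged buffer: separate host and device
    storage, where the GPU only observes host writes after an explicit
    copy_to_gpu() (unlike a UVA buffer, whose GPU view aliases pinned
    host memory directly)."""

    def __init__(self, *size: int, dtype: torch.dtype, device: str = "cuda"):
        self.cpu = torch.zeros(*size, dtype=dtype, pin_memory=True)
        self.np = self.cpu.numpy()  # zero-copy host view for cheap writes
        self.gpu = torch.zeros(*size, dtype=dtype, device=device)

    def copy_to_gpu(self) -> torch.Tensor:
        # Async H2D copy from pinned memory, ordered on the current CUDA
        # stream: kernels enqueued afterwards see a consistent snapshot.
        return self.gpu.copy_(self.cpu, non_blocking=True)

With this layout, host-side writes are not visible on the GPU until flushed, which is why update_states() now issues one copy per step whenever new requests were scheduled.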

vllm/v1/worker/gpu/states.py (4 additions, 1 deletion)

@@ -117,7 +117,10 @@ def __init__(
         self.prefill_token_ids = UvaBuffer(
             self.max_num_reqs, self.max_model_len, dtype=torch.int32
         )
-        self.prefill_len = UvaBuffer(self.max_num_reqs, dtype=torch.int32)
+        # NOTE(woosuk): We don't use UVA for prefill_len because its GPU view
+        # can be used outside of update_states and prepare_inputs.
+        # Without an async barrier, using UVA can cause race conditions.
+        self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
         # Number of computed tokens.
         self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
         self.num_computed_tokens = torch.zeros(
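
To make the hazard in the NOTE concrete, here is a hedged per-step timeline using the sketch class above; update_states, prepare_inputs, and the buffer names come from the diff, while the slot-reuse scenario is an assumption for illustration.

import torch

prefill_len = CpuGpuBuffer(8, dtype=torch.int32)  # sketch class from above

# Step N (update_states): host write for a newly scheduled request,
# followed by one explicit flush.
prefill_len.np[0] = 512
prefill_len.copy_to_gpu()
# Kernels enqueued from here on read prefill_len.gpu, a device-side
# snapshot that stays fixed until the next copy_to_gpu().

# Step N+1: slot 0 is reused for a different request. With a staged
# buffer, this host write is invisible to still-pending step-N kernels.
# With a UVA buffer, the GPU view aliases this same host memory, so a
# step-N kernel that executes late could observe 7 instead of 512
# unless an async barrier intervenes.
prefill_len.np[0] = 7
prefill_len.copy_to_gpu()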
